pax_global_header00006660000000000000000000000064151431063210014506gustar00rootroot0000000000000052 comment=84906a0fe98cbb5e5eaa2c888c50a1ab32d5d0b7 xdp-tools-1.6.1/000077500000000000000000000000001514310632100134445ustar00rootroot00000000000000xdp-tools-1.6.1/.clang-format000066400000000000000000000074421514310632100160260ustar00rootroot00000000000000# SPDX-License-Identifier: GPL-2.0 # # clang-format configuration file. Intended for clang-format >= 4. # # For more information, see: # # Documentation/process/clang-format.rst # https://clang.llvm.org/docs/ClangFormat.html # https://clang.llvm.org/docs/ClangFormatStyleOptions.html # --- AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left # Unknown to clang-format-4.0 AlignOperands: true AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: None AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: false BinPackArguments: true BinPackParameters: true BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: true AfterNamespace: true AfterObjCDeclaration: false AfterStruct: false AfterUnion: false AfterExternBlock: false # Unknown to clang-format-5.0 BeforeCatch: false BeforeElse: false IndentBraces: false SplitEmptyFunction: true # Unknown to clang-format-4.0 SplitEmptyRecord: true # Unknown to clang-format-4.0 SplitEmptyNamespace: true # Unknown to clang-format-4.0 BreakBeforeBinaryOperators: None BreakBeforeBraces: Custom BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 BreakBeforeTernaryOperators: false BreakConstructorInitializersBeforeComma: false 
BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 8 ContinuationIndentWidth: 8 Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: false # Unknown to clang-format-4.0 ForEachMacros: - 'FOR_EACH_OPTION' - 'FOR_EACH_MAP_KEY' - 'bpf_object__for_each_map' IncludeBlocks: Preserve # Unknown to clang-format-5.0 IncludeCategories: - Regex: '.*' Priority: 1 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: false IndentPPDirectives: None # Unknown to clang-format-5.0 IndentWidth: 8 IndentWrappedFunctionNames: false JavaScriptQuotes: Leave JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 ObjCBlockIndentWidth: 8 ObjCSpaceAfterProperty: true ObjCSpaceBeforeProtocolList: true # Taken from git's rules PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 PenaltyBreakBeforeFirstCallParameter: 30 PenaltyBreakComment: 10 PenaltyBreakFirstLessLess: 0 PenaltyBreakString: 10 PenaltyExcessCharacter: 100 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Right ReflowComments: false SortIncludes: false SortUsingDeclarations: false # Unknown to clang-format-4.0 SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 
SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp03 TabWidth: 8 UseTab: Always ... xdp-tools-1.6.1/.dockerignore000066400000000000000000000000671514310632100161230ustar00rootroot00000000000000packaging/docker/Dockerfile packaging/docker/docker.sh xdp-tools-1.6.1/.github/000077500000000000000000000000001514310632100150045ustar00rootroot00000000000000xdp-tools-1.6.1/.github/scripts/000077500000000000000000000000001514310632100164735ustar00rootroot00000000000000xdp-tools-1.6.1/.github/scripts/.config000066400000000000000000002276211514310632100177530ustar00rootroot00000000000000# # Automatically generated file; DO NOT EDIT. # Linux/x86 5.9.0-rc1 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 8.2.1 20180801 (Red Hat 8.2.1-2)" CONFIG_CC_IS_GCC=y CONFIG_GCC_VERSION=80201 CONFIG_LD_VERSION=230000000 CONFIG_CLANG_VERSION=0 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO=y CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y # # General setup # CONFIG_INIT_ENV_ARG_LIMIT=32 # CONFIG_COMPILE_TEST is not set CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_BUILD_SALT="" CONFIG_HAVE_KERNEL_GZIP=y CONFIG_HAVE_KERNEL_BZIP2=y CONFIG_HAVE_KERNEL_LZMA=y CONFIG_HAVE_KERNEL_XZ=y CONFIG_HAVE_KERNEL_LZO=y CONFIG_HAVE_KERNEL_LZ4=y CONFIG_HAVE_KERNEL_ZSTD=y CONFIG_KERNEL_GZIP=y # CONFIG_KERNEL_BZIP2 is not set # CONFIG_KERNEL_LZMA is not set # CONFIG_KERNEL_XZ is not set # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set # CONFIG_KERNEL_ZSTD is not set CONFIG_DEFAULT_INIT="" CONFIG_DEFAULT_HOSTNAME="(none)" CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_WATCH_QUEUE is not set CONFIG_CROSS_MEMORY_ATTACH=y # CONFIG_USELIB is not set CONFIG_AUDIT=y CONFIG_HAVE_ARCH_AUDITSYSCALL=y CONFIG_AUDITSYSCALL=y # # IRQ subsystem # 
CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_IRQ_SHOW=y CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y CONFIG_GENERIC_PENDING_IRQ=y CONFIG_GENERIC_IRQ_MIGRATION=y CONFIG_HARDIRQS_SW_RESEND=y CONFIG_IRQ_DOMAIN=y CONFIG_IRQ_DOMAIN_HIERARCHY=y CONFIG_GENERIC_MSI_IRQ=y CONFIG_GENERIC_MSI_IRQ_DOMAIN=y CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y CONFIG_GENERIC_IRQ_RESERVATION_MODE=y CONFIG_IRQ_FORCED_THREADING=y CONFIG_SPARSE_IRQ=y # CONFIG_GENERIC_IRQ_DEBUGFS is not set # end of IRQ subsystem CONFIG_CLOCKSOURCE_WATCHDOG=y CONFIG_ARCH_CLOCKSOURCE_INIT=y CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y CONFIG_GENERIC_CMOS_UPDATE=y CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y # # Timers subsystem # CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ_COMMON=y # CONFIG_HZ_PERIODIC is not set CONFIG_NO_HZ_IDLE=y # CONFIG_NO_HZ_FULL is not set CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y # end of Timers subsystem # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y CONFIG_PREEMPT_COUNT=y CONFIG_PREEMPTION=y # # CPU/Task time and stats accounting # CONFIG_TICK_CPU_ACCOUNTING=y # CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set # CONFIG_IRQ_TIME_ACCOUNTING is not set CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y # CONFIG_PSI is not set # end of CPU/Task time and stats accounting # CONFIG_CPU_ISOLATION is not set # # RCU Subsystem # CONFIG_TREE_RCU=y CONFIG_PREEMPT_RCU=y # CONFIG_RCU_EXPERT is not set CONFIG_SRCU=y CONFIG_TREE_SRCU=y CONFIG_TASKS_RCU_GENERIC=y CONFIG_TASKS_RCU=y CONFIG_TASKS_RUDE_RCU=y CONFIG_RCU_STALL_COMMON=y CONFIG_RCU_NEED_SEGCBLIST=y # end of RCU Subsystem CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_IKHEADERS is not set CONFIG_LOG_BUF_SHIFT=21 CONFIG_LOG_CPU_MAX_BUF_SHIFT=0 
CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y # # Scheduler features # # CONFIG_UCLAMP_TASK is not set # end of Scheduler features CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y CONFIG_CC_HAS_INT128=y CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y # CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_MEMCG_KMEM=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_WRITEBACK=y CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y # CONFIG_RT_GROUP_SCHED is not set # CONFIG_CGROUP_PIDS is not set # CONFIG_CGROUP_RDMA is not set CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y CONFIG_PROC_PID_CPUSET=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y # CONFIG_CGROUP_DEBUG is not set CONFIG_SOCK_CGROUP_DATA=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_TIME_NS=y CONFIG_IPC_NS=y CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_NET_NS=y # CONFIG_CHECKPOINT_RESTORE is not set # CONFIG_SCHED_AUTOGROUP is not set # CONFIG_SYSFS_DEPRECATED is not set CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y CONFIG_RD_BZIP2=y CONFIG_RD_LZMA=y CONFIG_RD_XZ=y CONFIG_RD_LZO=y CONFIG_RD_LZ4=y CONFIG_RD_ZSTD=y CONFIG_BOOT_CONFIG=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_SYSCTL_EXCEPTION_TRACE=y CONFIG_HAVE_PCSPKR_PLATFORM=y CONFIG_BPF=y CONFIG_EXPERT=y CONFIG_MULTIUSER=y CONFIG_SGETMASK_SYSCALL=y # CONFIG_SYSFS_SYSCALL is not set CONFIG_FHANDLE=y CONFIG_POSIX_TIMERS=y CONFIG_PRINTK=y CONFIG_PRINTK_NMI=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_FUTEX_PI=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y CONFIG_IO_URING=y CONFIG_ADVISE_SYSCALLS=y CONFIG_MEMBARRIER=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y 
CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y CONFIG_KALLSYMS_BASE_RELATIVE=y CONFIG_BPF_LSM=y CONFIG_BPF_SYSCALL=y CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_USERMODE_DRIVER=y CONFIG_BPF_PRELOAD=y CONFIG_BPF_PRELOAD_UMD=y # CONFIG_USERFAULTFD is not set CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y CONFIG_RSEQ=y # CONFIG_DEBUG_RSEQ is not set # CONFIG_EMBEDDED is not set CONFIG_HAVE_PERF_EVENTS=y # CONFIG_PC104 is not set # # Kernel Performance Events And Counters # CONFIG_PERF_EVENTS=y # CONFIG_DEBUG_PERF_USE_VMALLOC is not set # end of Kernel Performance Events And Counters CONFIG_VM_EVENT_COUNTERS=y CONFIG_SLUB_DEBUG=y # CONFIG_SLUB_MEMCG_SYSFS_ON is not set CONFIG_COMPAT_BRK=y # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_SLAB_MERGE_DEFAULT=y # CONFIG_SLAB_FREELIST_RANDOM is not set # CONFIG_SLAB_FREELIST_HARDENED is not set # CONFIG_SHUFFLE_PAGE_ALLOCATOR is not set CONFIG_SLUB_CPU_PARTIAL=y CONFIG_PROFILING=y CONFIG_TRACEPOINTS=y # end of General setup CONFIG_64BIT=y CONFIG_X86_64=y CONFIG_X86=y CONFIG_INSTRUCTION_DECODER=y CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_MMU=y CONFIG_ARCH_MMAP_RND_BITS_MIN=28 CONFIG_ARCH_MMAP_RND_BITS_MAX=32 CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_BUG=y CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_ARCH_HAS_FILTER_PGPROT=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_ARCH_WANT_GENERAL_HUGETLB=y CONFIG_ZONE_DMA32=y CONFIG_AUDIT_ARCH=y CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_X86_64_SMP=y CONFIG_ARCH_SUPPORTS_UPROBES=y CONFIG_FIX_EARLYCON_MEM=y CONFIG_PGTABLE_LEVELS=4 
CONFIG_CC_HAS_SANE_STACKPROTECTOR=y # # Processor type and features # CONFIG_ZONE_DMA=y CONFIG_SMP=y CONFIG_X86_FEATURE_NAMES=y CONFIG_X86_MPPARSE=y # CONFIG_GOLDFISH is not set # CONFIG_RETPOLINE is not set # CONFIG_X86_CPU_RESCTRL is not set CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_VSMP is not set # CONFIG_X86_GOLDFISH is not set # CONFIG_X86_INTEL_LPSS is not set # CONFIG_X86_AMD_PLATFORM_DEVICE is not set # CONFIG_IOSF_MBI is not set CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_HYPERVISOR_GUEST is not set # CONFIG_MK8 is not set # CONFIG_MPSC is not set CONFIG_MCORE2=y # CONFIG_MATOM is not set # CONFIG_GENERIC_CPU is not set CONFIG_X86_INTERNODE_CACHE_SHIFT=6 CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_IA32_FEAT_CTL=y CONFIG_X86_VMX_FEATURE_NAMES=y # CONFIG_PROCESSOR_SELECT is not set CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y CONFIG_CPU_SUP_HYGON=y CONFIG_CPU_SUP_CENTAUR=y CONFIG_CPU_SUP_ZHAOXIN=y CONFIG_HPET_TIMER=y CONFIG_DMI=y CONFIG_GART_IOMMU=y # CONFIG_MAXSMP is not set CONFIG_NR_CPUS_RANGE_BEGIN=2 CONFIG_NR_CPUS_RANGE_END=512 CONFIG_NR_CPUS_DEFAULT=64 CONFIG_NR_CPUS=128 CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y CONFIG_SCHED_MC_PRIO=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y # CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS is not set CONFIG_X86_MCE=y # CONFIG_X86_MCELOG_LEGACY is not set CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y CONFIG_X86_MCE_THRESHOLD=y # CONFIG_X86_MCE_INJECT is not set CONFIG_X86_THERMAL_VECTOR=y # # Performance monitoring # CONFIG_PERF_EVENTS_INTEL_UNCORE=y # CONFIG_PERF_EVENTS_INTEL_RAPL is not set # CONFIG_PERF_EVENTS_INTEL_CSTATE is not set # CONFIG_PERF_EVENTS_AMD_POWER is not set # end of Performance monitoring # CONFIG_X86_16BIT is not set CONFIG_X86_VSYSCALL_EMULATION=y CONFIG_X86_IOPL_IOPERM=y # CONFIG_I8K is 
not set # CONFIG_MICROCODE is not set CONFIG_X86_MSR=y CONFIG_X86_CPUID=y # CONFIG_X86_5LEVEL is not set CONFIG_X86_DIRECT_GBPAGES=y # CONFIG_X86_CPA_STATISTICS is not set # CONFIG_AMD_MEM_ENCRYPT is not set CONFIG_NUMA=y CONFIG_AMD_NUMA=y CONFIG_X86_64_ACPI_NUMA=y # CONFIG_NUMA_EMU is not set CONFIG_NODES_SHIFT=6 CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_ARCH_PROC_KCORE_TEXT=y CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 # CONFIG_X86_PMEM_LEGACY is not set # CONFIG_X86_CHECK_BIOS_CORRUPTION is not set CONFIG_X86_RESERVE_LOW=64 CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_ARCH_RANDOM=y CONFIG_X86_SMAP=y CONFIG_X86_UMIP=y # CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS is not set CONFIG_X86_INTEL_TSX_MODE_OFF=y # CONFIG_X86_INTEL_TSX_MODE_ON is not set # CONFIG_X86_INTEL_TSX_MODE_AUTO is not set CONFIG_EFI=y CONFIG_EFI_STUB=y # CONFIG_EFI_MIXED is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set CONFIG_HZ_1000=y CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y # CONFIG_KEXEC_FILE is not set # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x1000000 CONFIG_RELOCATABLE=y # CONFIG_RANDOMIZE_BASE is not set CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set # CONFIG_DEBUG_HOTPLUG_CPU0 is not set # CONFIG_LEGACY_VSYSCALL_EMULATE is not set # CONFIG_LEGACY_VSYSCALL_XONLY is not set CONFIG_LEGACY_VSYSCALL_NONE=y # CONFIG_CMDLINE_BOOL is not set CONFIG_MODIFY_LDT_SYSCALL=y CONFIG_HAVE_LIVEPATCH=y # CONFIG_LIVEPATCH is not set # end of Processor type and features CONFIG_ARCH_HAS_ADD_PAGES=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y CONFIG_ARCH_ENABLE_THP_MIGRATION=y # # Power management and ACPI options # # CONFIG_SUSPEND is not set # 
CONFIG_HIBERNATION is not set # CONFIG_PM is not set # CONFIG_ENERGY_MODEL is not set CONFIG_ARCH_SUPPORTS_ACPI=y CONFIG_ACPI=y CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y # CONFIG_ACPI_DEBUGGER is not set # CONFIG_ACPI_SPCR_TABLE is not set CONFIG_ACPI_LPIT=y # CONFIG_ACPI_REV_OVERRIDE_POSSIBLE is not set # CONFIG_ACPI_EC_DEBUGFS is not set # CONFIG_ACPI_AC is not set # CONFIG_ACPI_BATTERY is not set # CONFIG_ACPI_BUTTON is not set # CONFIG_ACPI_TINY_POWER_BUTTON is not set # CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_FAN is not set # CONFIG_ACPI_DOCK is not set CONFIG_ACPI_CPU_FREQ_PSS=y CONFIG_ACPI_PROCESSOR_CSTATE=y CONFIG_ACPI_PROCESSOR_IDLE=y CONFIG_ACPI_CPPC_LIB=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y # CONFIG_ACPI_PROCESSOR_AGGREGATOR is not set # CONFIG_ACPI_THERMAL is not set CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y # CONFIG_ACPI_TABLE_UPGRADE is not set # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set CONFIG_ACPI_CONTAINER=y CONFIG_ACPI_HOTPLUG_IOAPIC=y # CONFIG_ACPI_SBS is not set # CONFIG_ACPI_HED is not set # CONFIG_ACPI_CUSTOM_METHOD is not set # CONFIG_ACPI_BGRT is not set # CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set # CONFIG_ACPI_NFIT is not set CONFIG_ACPI_NUMA=y # CONFIG_ACPI_HMAT is not set CONFIG_HAVE_ACPI_APEI=y CONFIG_HAVE_ACPI_APEI_NMI=y # CONFIG_ACPI_APEI is not set # CONFIG_DPTF_POWER is not set # CONFIG_PMIC_OPREGION is not set # CONFIG_ACPI_CONFIGFS is not set # CONFIG_X86_PM_TIMER is not set # CONFIG_SFI is not set # # CPU Frequency scaling # CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_GOV_ATTR_SET=y CONFIG_CPU_FREQ_GOV_COMMON=y CONFIG_CPU_FREQ_STAT=y CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set 
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y # # CPU frequency scaling drivers # CONFIG_X86_INTEL_PSTATE=y # CONFIG_X86_PCC_CPUFREQ is not set CONFIG_X86_ACPI_CPUFREQ=y CONFIG_X86_ACPI_CPUFREQ_CPB=y CONFIG_X86_POWERNOW_K8=y # CONFIG_X86_AMD_FREQ_SENSITIVITY is not set # CONFIG_X86_SPEEDSTEP_CENTRINO is not set # CONFIG_X86_P4_CLOCKMOD is not set # # shared options # # end of CPU Frequency scaling # # CPU Idle # CONFIG_CPU_IDLE=y CONFIG_CPU_IDLE_GOV_LADDER=y CONFIG_CPU_IDLE_GOV_MENU=y # CONFIG_CPU_IDLE_GOV_TEO is not set # end of CPU Idle # CONFIG_INTEL_IDLE is not set # end of Power management and ACPI options # # Bus options (PCI etc.) # CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_MMCONF_FAM10H=y # CONFIG_PCI_CNB20LE_QUIRK is not set # CONFIG_ISA_BUS is not set CONFIG_ISA_DMA_API=y CONFIG_AMD_NB=y # CONFIG_X86_SYSFB is not set # end of Bus options (PCI etc.) 
# # Binary Emulations # # CONFIG_IA32_EMULATION is not set # CONFIG_X86_X32 is not set # end of Binary Emulations # # Firmware Drivers # # CONFIG_EDD is not set CONFIG_FIRMWARE_MEMMAP=y CONFIG_DMIID=y # CONFIG_DMI_SYSFS is not set CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y # CONFIG_FW_CFG_SYSFS is not set # CONFIG_GOOGLE_FIRMWARE is not set # # EFI (Extensible Firmware Interface) Support # # CONFIG_EFI_VARS is not set CONFIG_EFI_ESRT=y CONFIG_EFI_RUNTIME_MAP=y # CONFIG_EFI_FAKE_MEMMAP is not set CONFIG_EFI_RUNTIME_WRAPPERS=y CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y # CONFIG_EFI_CAPSULE_LOADER is not set # CONFIG_EFI_TEST is not set # CONFIG_APPLE_PROPERTIES is not set # CONFIG_RESET_ATTACK_MITIGATION is not set # CONFIG_EFI_RCI2_TABLE is not set # CONFIG_EFI_DISABLE_PCI_DMA is not set # end of EFI (Extensible Firmware Interface) Support CONFIG_EFI_EARLYCON=y # # Tegra firmware driver # # end of Tegra firmware driver # end of Firmware Drivers CONFIG_HAVE_KVM=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set CONFIG_KVM_WERROR=y CONFIG_AS_AVX512=y CONFIG_AS_SHA1_NI=y CONFIG_AS_SHA256_NI=y # # General architecture-dependent options # CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_HOTPLUG_SMT=y CONFIG_GENERIC_ENTRY=y # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_OPROFILE_NMI_TIMER=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y # CONFIG_STATIC_KEYS_SELFTEST is not set CONFIG_OPTPROBES=y CONFIG_KPROBES_ON_FTRACE=y CONFIG_UPROBES=y CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y CONFIG_ARCH_USE_BUILTIN_BSWAP=y CONFIG_KRETPROBES=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_OPTPROBES=y CONFIG_HAVE_KPROBES_ON_FTRACE=y CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y CONFIG_HAVE_NMI=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_DMA_CONTIGUOUS=y CONFIG_GENERIC_SMP_IDLE_THREAD=y CONFIG_ARCH_HAS_FORTIFY_SOURCE=y CONFIG_ARCH_HAS_SET_MEMORY=y CONFIG_ARCH_HAS_SET_DIRECT_MAP=y CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y 
CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y CONFIG_HAVE_ASM_MODVERSIONS=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y CONFIG_HAVE_RSEQ=y CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y CONFIG_HAVE_HW_BREAKPOINT=y CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y CONFIG_HAVE_USER_RETURN_NOTIFIER=y CONFIG_HAVE_PERF_EVENTS_NMI=y CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y CONFIG_HAVE_PERF_REGS=y CONFIG_HAVE_PERF_USER_STACK_DUMP=y CONFIG_HAVE_ARCH_JUMP_LABEL=y CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y CONFIG_HAVE_CMPXCHG_LOCAL=y CONFIG_HAVE_CMPXCHG_DOUBLE=y CONFIG_HAVE_ARCH_SECCOMP_FILTER=y CONFIG_SECCOMP_FILTER=y CONFIG_HAVE_ARCH_STACKLEAK=y CONFIG_HAVE_STACKPROTECTOR=y # CONFIG_STACKPROTECTOR is not set CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y CONFIG_HAVE_CONTEXT_TRACKING=y CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y CONFIG_HAVE_MOVE_PMD=y CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y CONFIG_HAVE_ARCH_HUGE_VMAP=y CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y CONFIG_HAVE_ARCH_SOFT_DIRTY=y CONFIG_HAVE_MOD_ARCH_SPECIFIC=y CONFIG_MODULES_USE_ELF_RELA=y CONFIG_ARCH_HAS_ELF_RANDOMIZE=y CONFIG_HAVE_ARCH_MMAP_RND_BITS=y CONFIG_HAVE_EXIT_THREAD=y CONFIG_ARCH_MMAP_RND_BITS=28 CONFIG_HAVE_STACK_VALIDATION=y CONFIG_HAVE_RELIABLE_STACKTRACE=y CONFIG_COMPAT_32BIT_TIME=y CONFIG_HAVE_ARCH_VMAP_STACK=y CONFIG_VMAP_STACK=y CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y CONFIG_STRICT_MODULE_RWX=y CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y CONFIG_ARCH_USE_MEMREMAP_PROT=y # CONFIG_LOCK_EVENT_COUNTS is not set CONFIG_ARCH_HAS_MEM_ENCRYPT=y # # GCOV-based kernel profiling # # CONFIG_GCOV_KERNEL is not set CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y # end of GCOV-based kernel profiling CONFIG_HAVE_GCC_PLUGINS=y # end of General architecture-dependent options CONFIG_RT_MUTEXES=y CONFIG_BASE_SMALL=0 CONFIG_MODULES=y # CONFIG_MODULE_FORCE_LOAD is not set 
CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_FORCE_UNLOAD is not set CONFIG_MODVERSIONS=y CONFIG_ASM_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y # CONFIG_MODULE_SIG is not set # CONFIG_MODULE_COMPRESS is not set # CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set # CONFIG_UNUSED_SYMBOLS is not set # CONFIG_TRIM_UNUSED_KSYMS is not set CONFIG_MODULES_TREE_LOOKUP=y CONFIG_BLOCK=y CONFIG_BLK_SCSI_REQUEST=y CONFIG_BLK_CGROUP_RWSTAT=y CONFIG_BLK_DEV_BSG=y CONFIG_BLK_DEV_BSGLIB=y # CONFIG_BLK_DEV_INTEGRITY is not set # CONFIG_BLK_DEV_ZONED is not set CONFIG_BLK_DEV_THROTTLING=y # CONFIG_BLK_DEV_THROTTLING_LOW is not set # CONFIG_BLK_CMDLINE_PARSER is not set # CONFIG_BLK_WBT is not set CONFIG_BLK_CGROUP_IOLATENCY=y # CONFIG_BLK_CGROUP_IOCOST is not set CONFIG_BLK_DEBUG_FS=y # CONFIG_BLK_SED_OPAL is not set # CONFIG_BLK_INLINE_ENCRYPTION is not set # # Partition Types # CONFIG_PARTITION_ADVANCED=y # CONFIG_ACORN_PARTITION is not set # CONFIG_AIX_PARTITION is not set CONFIG_OSF_PARTITION=y CONFIG_AMIGA_PARTITION=y # CONFIG_ATARI_PARTITION is not set CONFIG_MAC_PARTITION=y CONFIG_MSDOS_PARTITION=y CONFIG_BSD_DISKLABEL=y CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y # CONFIG_LDM_PARTITION is not set CONFIG_SGI_PARTITION=y # CONFIG_ULTRIX_PARTITION is not set CONFIG_SUN_PARTITION=y CONFIG_KARMA_PARTITION=y CONFIG_EFI_PARTITION=y # CONFIG_SYSV68_PARTITION is not set # CONFIG_CMDLINE_PARTITION is not set # end of Partition Types CONFIG_BLK_MQ_PCI=y CONFIG_BLK_MQ_VIRTIO=y # # IO Schedulers # CONFIG_MQ_IOSCHED_DEADLINE=y CONFIG_MQ_IOSCHED_KYBER=y # CONFIG_IOSCHED_BFQ is not set # end of IO Schedulers CONFIG_ASN1=y CONFIG_UNINLINE_SPIN_UNLOCK=y CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y CONFIG_MUTEX_SPIN_ON_OWNER=y CONFIG_RWSEM_SPIN_ON_OWNER=y CONFIG_LOCK_SPIN_ON_OWNER=y CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y CONFIG_QUEUED_SPINLOCKS=y CONFIG_ARCH_USE_QUEUED_RWLOCKS=y CONFIG_QUEUED_RWLOCKS=y CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y 
CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y CONFIG_FREEZER=y # # Executable file formats # CONFIG_BINFMT_ELF=y CONFIG_ELFCORE=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_BINFMT_SCRIPT=y CONFIG_BINFMT_MISC=y CONFIG_COREDUMP=y # end of Executable file formats # # Memory Management options # CONFIG_SELECT_MEMORY_MODEL=y CONFIG_SPARSEMEM_MANUAL=y CONFIG_SPARSEMEM=y CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y CONFIG_HAVE_FAST_GUP=y CONFIG_MEMORY_ISOLATION=y # CONFIG_MEMORY_HOTPLUG is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MEMORY_BALLOON=y CONFIG_BALLOON_COMPACTION=y CONFIG_COMPACTION=y CONFIG_PAGE_REPORTING=y CONFIG_MIGRATION=y CONFIG_CONTIG_ALLOC=y CONFIG_PHYS_ADDR_T_64BIT=y CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y CONFIG_MEMORY_FAILURE=y CONFIG_HWPOISON_INJECT=y CONFIG_TRANSPARENT_HUGEPAGE=y # CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y CONFIG_ARCH_WANTS_THP_SWAP=y CONFIG_THP_SWAP=y # CONFIG_CLEANCACHE is not set # CONFIG_FRONTSWAP is not set CONFIG_CMA=y # CONFIG_CMA_DEBUG is not set # CONFIG_CMA_DEBUGFS is not set CONFIG_CMA_AREAS=7 # CONFIG_ZPOOL is not set # CONFIG_ZBUD is not set # CONFIG_ZSMALLOC is not set CONFIG_GENERIC_EARLY_IOREMAP=y # CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set # CONFIG_IDLE_PAGE_TRACKING is not set CONFIG_ARCH_HAS_PTE_DEVMAP=y # CONFIG_PERCPU_STATS is not set # CONFIG_GUP_BENCHMARK is not set # CONFIG_READ_ONLY_THP_FOR_FS is not set CONFIG_ARCH_HAS_PTE_SPECIAL=y # end of Memory Management options CONFIG_NET=y CONFIG_NET_INGRESS=y CONFIG_NET_EGRESS=y CONFIG_SKB_EXTENSIONS=y # # Networking options # CONFIG_PACKET=y # CONFIG_PACKET_DIAG is not set CONFIG_UNIX=y CONFIG_UNIX_SCM=y # CONFIG_UNIX_DIAG is not set CONFIG_TLS=y # CONFIG_TLS_DEVICE is not set # CONFIG_TLS_TOE is not set CONFIG_XFRM=y 
CONFIG_XFRM_ALGO=y CONFIG_XFRM_USER=y # CONFIG_XFRM_INTERFACE is not set CONFIG_XFRM_SUB_POLICY=y # CONFIG_XFRM_MIGRATE is not set # CONFIG_XFRM_STATISTICS is not set # CONFIG_NET_KEY is not set CONFIG_XDP_SOCKETS=y CONFIG_XDP_SOCKETS_DIAG=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y # CONFIG_IP_FIB_TRIE_STATS is not set CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y # CONFIG_IP_PNP is not set CONFIG_NET_IPIP=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NET_IP_TUNNEL=y CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_BROADCAST=y CONFIG_IP_MROUTE_COMMON=y CONFIG_IP_MROUTE=y # CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_SYN_COOKIES=y # CONFIG_NET_IPVTI is not set CONFIG_NET_UDP_TUNNEL=y # CONFIG_NET_FOU is not set # CONFIG_NET_FOU_IP_TUNNELS is not set # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set CONFIG_INET_TUNNEL=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_INET_UDP_DIAG is not set # CONFIG_INET_RAW_DIAG is not set # CONFIG_INET_DIAG_DESTROY is not set CONFIG_TCP_CONG_ADVANCED=y CONFIG_TCP_CONG_BIC=m CONFIG_TCP_CONG_CUBIC=y CONFIG_TCP_CONG_WESTWOOD=m CONFIG_TCP_CONG_HTCP=m # CONFIG_TCP_CONG_HSTCP is not set # CONFIG_TCP_CONG_HYBLA is not set # CONFIG_TCP_CONG_VEGAS is not set # CONFIG_TCP_CONG_NV is not set # CONFIG_TCP_CONG_SCALABLE is not set # CONFIG_TCP_CONG_LP is not set # CONFIG_TCP_CONG_VENO is not set # CONFIG_TCP_CONG_YEAH is not set # CONFIG_TCP_CONG_ILLINOIS is not set # CONFIG_TCP_CONG_DCTCP is not set # CONFIG_TCP_CONG_CDG is not set # CONFIG_TCP_CONG_BBR is not set # CONFIG_DEFAULT_CUBIC is not set CONFIG_DEFAULT_RENO=y CONFIG_DEFAULT_TCP_CONG="reno" CONFIG_TCP_MD5SIG=y CONFIG_IPV6=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y # CONFIG_IPV6_OPTIMISTIC_DAD is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set CONFIG_IPV6_MIP6=y # CONFIG_IPV6_ILA is not set 
CONFIG_INET6_TUNNEL=y # CONFIG_IPV6_VTI is not set CONFIG_IPV6_SIT=y # CONFIG_IPV6_SIT_6RD is not set CONFIG_IPV6_NDISC_NODETYPE=y CONFIG_IPV6_TUNNEL=y CONFIG_IPV6_GRE=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_SUBTREES=y # CONFIG_IPV6_MROUTE is not set CONFIG_IPV6_SEG6_LWTUNNEL=y # CONFIG_IPV6_SEG6_HMAC is not set CONFIG_IPV6_SEG6_BPF=y # CONFIG_IPV6_RPL_LWTUNNEL is not set CONFIG_NETLABEL=y # CONFIG_MPTCP is not set CONFIG_NETWORK_SECMARK=y CONFIG_NET_PTP_CLASSIFY=y # CONFIG_NETWORK_PHY_TIMESTAMPING is not set CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y # # Core Netfilter Configuration # CONFIG_NETFILTER_INGRESS=y CONFIG_NETFILTER_NETLINK=y # CONFIG_NETFILTER_NETLINK_ACCT is not set CONFIG_NETFILTER_NETLINK_QUEUE=y CONFIG_NETFILTER_NETLINK_LOG=y # CONFIG_NETFILTER_NETLINK_OSF is not set # CONFIG_NF_CONNTRACK is not set # CONFIG_NF_LOG_NETDEV is not set # CONFIG_NF_TABLES is not set CONFIG_NETFILTER_XTABLES=y # # Xtables combined modules # # CONFIG_NETFILTER_XT_MARK is not set # # Xtables targets # # CONFIG_NETFILTER_XT_TARGET_AUDIT is not set # CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set # CONFIG_NETFILTER_XT_TARGET_HMARK is not set # CONFIG_NETFILTER_XT_TARGET_IDLETIMER is not set # CONFIG_NETFILTER_XT_TARGET_LOG is not set # CONFIG_NETFILTER_XT_TARGET_MARK is not set # CONFIG_NETFILTER_XT_TARGET_NFLOG is not set # CONFIG_NETFILTER_XT_TARGET_NFQUEUE is not set # CONFIG_NETFILTER_XT_TARGET_RATEEST is not set # CONFIG_NETFILTER_XT_TARGET_TEE is not set # CONFIG_NETFILTER_XT_TARGET_SECMARK is not set # CONFIG_NETFILTER_XT_TARGET_TCPMSS is not set # # Xtables matches # # CONFIG_NETFILTER_XT_MATCH_ADDRTYPE is not set CONFIG_NETFILTER_XT_MATCH_BPF=y # CONFIG_NETFILTER_XT_MATCH_CGROUP is not set # CONFIG_NETFILTER_XT_MATCH_COMMENT is not set # CONFIG_NETFILTER_XT_MATCH_CPU is not set # CONFIG_NETFILTER_XT_MATCH_DCCP is not set # CONFIG_NETFILTER_XT_MATCH_DEVGROUP is not set # CONFIG_NETFILTER_XT_MATCH_DSCP is not set # CONFIG_NETFILTER_XT_MATCH_ECN is not set # 
CONFIG_NETFILTER_XT_MATCH_ESP is not set # CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set # CONFIG_NETFILTER_XT_MATCH_HL is not set # CONFIG_NETFILTER_XT_MATCH_IPCOMP is not set # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set # CONFIG_NETFILTER_XT_MATCH_L2TP is not set # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set # CONFIG_NETFILTER_XT_MATCH_LIMIT is not set # CONFIG_NETFILTER_XT_MATCH_MAC is not set # CONFIG_NETFILTER_XT_MATCH_MARK is not set # CONFIG_NETFILTER_XT_MATCH_MULTIPORT is not set # CONFIG_NETFILTER_XT_MATCH_NFACCT is not set # CONFIG_NETFILTER_XT_MATCH_OSF is not set # CONFIG_NETFILTER_XT_MATCH_OWNER is not set # CONFIG_NETFILTER_XT_MATCH_POLICY is not set # CONFIG_NETFILTER_XT_MATCH_PKTTYPE is not set # CONFIG_NETFILTER_XT_MATCH_QUOTA is not set # CONFIG_NETFILTER_XT_MATCH_RATEEST is not set # CONFIG_NETFILTER_XT_MATCH_REALM is not set # CONFIG_NETFILTER_XT_MATCH_RECENT is not set # CONFIG_NETFILTER_XT_MATCH_SCTP is not set # CONFIG_NETFILTER_XT_MATCH_SOCKET is not set CONFIG_NETFILTER_XT_MATCH_STATISTIC=y # CONFIG_NETFILTER_XT_MATCH_STRING is not set # CONFIG_NETFILTER_XT_MATCH_TCPMSS is not set # CONFIG_NETFILTER_XT_MATCH_TIME is not set # CONFIG_NETFILTER_XT_MATCH_U32 is not set # end of Core Netfilter Configuration # CONFIG_IP_SET is not set # CONFIG_IP_VS is not set # # IP: Netfilter Configuration # # CONFIG_NF_SOCKET_IPV4 is not set # CONFIG_NF_TPROXY_IPV4 is not set # CONFIG_NF_DUP_IPV4 is not set # CONFIG_NF_LOG_ARP is not set # CONFIG_NF_LOG_IPV4 is not set # CONFIG_NF_REJECT_IPV4 is not set CONFIG_IP_NF_IPTABLES=y # CONFIG_IP_NF_MATCH_AH is not set # CONFIG_IP_NF_MATCH_ECN is not set # CONFIG_IP_NF_MATCH_TTL is not set # CONFIG_IP_NF_FILTER is not set # CONFIG_IP_NF_MANGLE is not set # CONFIG_IP_NF_RAW is not set # CONFIG_IP_NF_SECURITY is not set # CONFIG_IP_NF_ARPTABLES is not set # end of IP: Netfilter Configuration # # IPv6: Netfilter Configuration # # CONFIG_NF_SOCKET_IPV6 is not set # CONFIG_NF_TPROXY_IPV6 is not set # CONFIG_NF_DUP_IPV6 
is not set # CONFIG_NF_REJECT_IPV6 is not set # CONFIG_NF_LOG_IPV6 is not set CONFIG_IP6_NF_IPTABLES=y # CONFIG_IP6_NF_MATCH_AH is not set # CONFIG_IP6_NF_MATCH_EUI64 is not set # CONFIG_IP6_NF_MATCH_FRAG is not set # CONFIG_IP6_NF_MATCH_OPTS is not set # CONFIG_IP6_NF_MATCH_HL is not set # CONFIG_IP6_NF_MATCH_IPV6HEADER is not set # CONFIG_IP6_NF_MATCH_MH is not set # CONFIG_IP6_NF_MATCH_RT is not set # CONFIG_IP6_NF_MATCH_SRH is not set # CONFIG_IP6_NF_FILTER is not set # CONFIG_IP6_NF_MANGLE is not set # CONFIG_IP6_NF_RAW is not set # CONFIG_IP6_NF_SECURITY is not set # end of IPv6: Netfilter Configuration CONFIG_BPFILTER=y CONFIG_BPFILTER_UMH=m # CONFIG_IP_DCCP is not set # CONFIG_IP_SCTP is not set # CONFIG_RDS is not set # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_L2TP is not set # CONFIG_BRIDGE is not set CONFIG_HAVE_NET_DSA=y # CONFIG_NET_DSA is not set CONFIG_VLAN_8021Q=y # CONFIG_VLAN_8021Q_GVRP is not set # CONFIG_VLAN_8021Q_MVRP is not set # CONFIG_DECNET is not set # CONFIG_LLC2 is not set # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set # CONFIG_PHONET is not set # CONFIG_6LOWPAN is not set # CONFIG_IEEE802154 is not set CONFIG_NET_SCHED=y # # Queueing/Scheduling # # CONFIG_NET_SCH_CBQ is not set # CONFIG_NET_SCH_HTB is not set # CONFIG_NET_SCH_HFSC is not set # CONFIG_NET_SCH_PRIO is not set # CONFIG_NET_SCH_MULTIQ is not set # CONFIG_NET_SCH_RED is not set # CONFIG_NET_SCH_SFB is not set # CONFIG_NET_SCH_SFQ is not set # CONFIG_NET_SCH_TEQL is not set # CONFIG_NET_SCH_TBF is not set # CONFIG_NET_SCH_CBS is not set # CONFIG_NET_SCH_ETF is not set # CONFIG_NET_SCH_TAPRIO is not set # CONFIG_NET_SCH_GRED is not set # CONFIG_NET_SCH_DSMARK is not set # CONFIG_NET_SCH_NETEM is not set # CONFIG_NET_SCH_DRR is not set # CONFIG_NET_SCH_MQPRIO is not set # CONFIG_NET_SCH_SKBPRIO is not set # CONFIG_NET_SCH_CHOKE is not set # CONFIG_NET_SCH_QFQ is not set # CONFIG_NET_SCH_CODEL is not set CONFIG_NET_SCH_FQ_CODEL=y # 
CONFIG_NET_SCH_CAKE is not set # CONFIG_NET_SCH_FQ is not set # CONFIG_NET_SCH_HHF is not set # CONFIG_NET_SCH_PIE is not set CONFIG_NET_SCH_INGRESS=y # CONFIG_NET_SCH_PLUG is not set # CONFIG_NET_SCH_ETS is not set CONFIG_NET_SCH_DEFAULT=y CONFIG_DEFAULT_FQ_CODEL=y # CONFIG_DEFAULT_PFIFO_FAST is not set CONFIG_DEFAULT_NET_SCH="fq_codel" # # Classification # CONFIG_NET_CLS=y # CONFIG_NET_CLS_BASIC is not set # CONFIG_NET_CLS_TCINDEX is not set # CONFIG_NET_CLS_ROUTE4 is not set # CONFIG_NET_CLS_FW is not set # CONFIG_NET_CLS_U32 is not set # CONFIG_NET_CLS_RSVP is not set # CONFIG_NET_CLS_RSVP6 is not set # CONFIG_NET_CLS_FLOW is not set CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=y # CONFIG_NET_CLS_FLOWER is not set # CONFIG_NET_CLS_MATCHALL is not set CONFIG_NET_EMATCH=y CONFIG_NET_EMATCH_STACK=32 # CONFIG_NET_EMATCH_CMP is not set # CONFIG_NET_EMATCH_NBYTE is not set # CONFIG_NET_EMATCH_U32 is not set # CONFIG_NET_EMATCH_META is not set # CONFIG_NET_EMATCH_TEXT is not set # CONFIG_NET_EMATCH_IPT is not set CONFIG_NET_CLS_ACT=y # CONFIG_NET_ACT_POLICE is not set # CONFIG_NET_ACT_GACT is not set # CONFIG_NET_ACT_MIRRED is not set # CONFIG_NET_ACT_SAMPLE is not set # CONFIG_NET_ACT_IPT is not set # CONFIG_NET_ACT_NAT is not set # CONFIG_NET_ACT_PEDIT is not set # CONFIG_NET_ACT_SIMP is not set # CONFIG_NET_ACT_SKBEDIT is not set # CONFIG_NET_ACT_CSUM is not set # CONFIG_NET_ACT_MPLS is not set # CONFIG_NET_ACT_VLAN is not set CONFIG_NET_ACT_BPF=y # CONFIG_NET_ACT_SKBMOD is not set # CONFIG_NET_ACT_IFE is not set # CONFIG_NET_ACT_TUNNEL_KEY is not set # CONFIG_NET_ACT_GATE is not set CONFIG_NET_TC_SKB_EXT=y CONFIG_NET_SCH_FIFO=y CONFIG_DCB=y CONFIG_DNS_RESOLVER=y # CONFIG_BATMAN_ADV is not set # CONFIG_OPENVSWITCH is not set # CONFIG_VSOCKETS is not set # CONFIG_NETLINK_DIAG is not set CONFIG_MPLS=y # CONFIG_NET_MPLS_GSO is not set # CONFIG_MPLS_ROUTING is not set # CONFIG_NET_NSH is not set # CONFIG_HSR is not set # CONFIG_NET_SWITCHDEV is not set # 
CONFIG_NET_L3_MASTER_DEV is not set # CONFIG_QRTR is not set # CONFIG_NET_NCSI is not set CONFIG_RPS=y CONFIG_RFS_ACCEL=y CONFIG_XPS=y # CONFIG_CGROUP_NET_PRIO is not set CONFIG_CGROUP_NET_CLASSID=y CONFIG_NET_RX_BUSY_POLL=y CONFIG_BQL=y CONFIG_BPF_JIT=y CONFIG_BPF_STREAM_PARSER=y CONFIG_NET_FLOW_LIMIT=y # # Network testing # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_DROP_MONITOR is not set # end of Network testing # end of Networking options # CONFIG_HAMRADIO is not set # CONFIG_CAN is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set # CONFIG_AF_KCM is not set CONFIG_STREAM_PARSER=y CONFIG_FIB_RULES=y CONFIG_WIRELESS=y # CONFIG_CFG80211 is not set # # CFG80211 needs to be enabled for MAC80211 # CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 # CONFIG_WIMAX is not set # CONFIG_RFKILL is not set CONFIG_NET_9P=y CONFIG_NET_9P_VIRTIO=y # CONFIG_NET_9P_DEBUG is not set # CONFIG_CAIF is not set # CONFIG_CEPH_LIB is not set # CONFIG_NFC is not set # CONFIG_PSAMPLE is not set # CONFIG_NET_IFE is not set CONFIG_LWTUNNEL=y CONFIG_LWTUNNEL_BPF=y CONFIG_DST_CACHE=y CONFIG_GRO_CELLS=y CONFIG_NET_SOCK_MSG=y CONFIG_NET_DEVLINK=y CONFIG_FAILOVER=y CONFIG_ETHTOOL_NETLINK=y CONFIG_HAVE_EBPF_JIT=y # # Device Drivers # CONFIG_HAVE_EISA=y # CONFIG_EISA is not set CONFIG_HAVE_PCI=y CONFIG_PCI=y CONFIG_PCI_DOMAINS=y CONFIG_PCIEPORTBUS=y # CONFIG_PCIEAER is not set CONFIG_PCIEASPM=y CONFIG_PCIEASPM_DEFAULT=y # CONFIG_PCIEASPM_POWERSAVE is not set # CONFIG_PCIEASPM_POWER_SUPERSAVE is not set # CONFIG_PCIEASPM_PERFORMANCE is not set # CONFIG_PCIE_PTM is not set # CONFIG_PCIE_BW is not set CONFIG_PCI_MSI=y CONFIG_PCI_MSI_IRQ_DOMAIN=y CONFIG_PCI_QUIRKS=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_REALLOC_ENABLE_AUTO is not set # CONFIG_PCI_STUB is not set # CONFIG_PCI_PF_STUB is not set CONFIG_PCI_ATS=y CONFIG_PCI_LOCKLESS_CONFIG=y CONFIG_PCI_IOV=y # CONFIG_PCI_PRI is not set # CONFIG_PCI_PASID is not set CONFIG_PCI_LABEL=y # CONFIG_HOTPLUG_PCI is not set # # PCI controller drivers # # 
CONFIG_VMD is not set # # DesignWare PCI Core Support # # CONFIG_PCIE_DW_PLAT_HOST is not set # CONFIG_PCI_MESON is not set # end of DesignWare PCI Core Support # # Mobiveil PCIe Core Support # # end of Mobiveil PCIe Core Support # # Cadence PCIe controllers support # # end of Cadence PCIe controllers support # end of PCI controller drivers # # PCI Endpoint # # CONFIG_PCI_ENDPOINT is not set # end of PCI Endpoint # # PCI switch controller drivers # # CONFIG_PCI_SW_SWITCHTEC is not set # end of PCI switch controller drivers # CONFIG_PCCARD is not set # CONFIG_RAPIDIO is not set # # Generic Driver Options # # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_STANDALONE=y # CONFIG_PREVENT_FIRMWARE_BUILD is not set # # Firmware loader # CONFIG_FW_LOADER=y CONFIG_FW_LOADER_PAGED_BUF=y CONFIG_EXTRA_FIRMWARE="" CONFIG_FW_LOADER_USER_HELPER=y # CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set # CONFIG_FW_LOADER_COMPRESS is not set # end of Firmware loader CONFIG_ALLOW_DEV_COREDUMP=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set # CONFIG_TEST_ASYNC_DRIVER_PROBE is not set CONFIG_GENERIC_CPU_AUTOPROBE=y CONFIG_GENERIC_CPU_VULNERABILITIES=y CONFIG_DMA_SHARED_BUFFER=y # CONFIG_DMA_FENCE_TRACE is not set # end of Generic Driver Options # # Bus devices # # CONFIG_MHI_BUS is not set # end of Bus devices # CONFIG_CONNECTOR is not set # CONFIG_GNSS is not set # CONFIG_MTD is not set # CONFIG_OF is not set CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y # CONFIG_PARPORT is not set CONFIG_PNP=y # CONFIG_PNP_DEBUG_MESSAGES is not set # # Protocols # CONFIG_PNPACPI=y CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_NULL_BLK is not set # CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set # CONFIG_BLK_DEV_UMEM is not set CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_DRBD is not set # CONFIG_BLK_DEV_NBD is not set # CONFIG_BLK_DEV_SKD is not set # CONFIG_BLK_DEV_SX8 is not set CONFIG_BLK_DEV_RAM=y 
CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set CONFIG_VIRTIO_BLK=y # CONFIG_BLK_DEV_RBD is not set # CONFIG_BLK_DEV_RSXX is not set # # NVME Support # # CONFIG_BLK_DEV_NVME is not set # CONFIG_NVME_FC is not set # end of NVME Support # # Misc devices # # CONFIG_DUMMY_IRQ is not set # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set # CONFIG_TIFM_CORE is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set # CONFIG_SRAM is not set # CONFIG_PCI_ENDPOINT_TEST is not set # CONFIG_XILINX_SDFEC is not set # CONFIG_PVPANIC is not set # CONFIG_C2PORT is not set # # EEPROM support # # CONFIG_EEPROM_93CX6 is not set # end of EEPROM support # CONFIG_CB710_CORE is not set # # Texas Instruments shared transport line discipline # # end of Texas Instruments shared transport line discipline # # Altera FPGA firmware download module (requires I2C) # # CONFIG_INTEL_MEI is not set # CONFIG_INTEL_MEI_ME is not set # CONFIG_INTEL_MEI_TXE is not set # CONFIG_VMWARE_VMCI is not set # # Intel MIC & related support # # CONFIG_INTEL_MIC_BUS is not set # CONFIG_SCIF_BUS is not set # CONFIG_VOP_BUS is not set # end of Intel MIC & related support # CONFIG_GENWQE is not set # CONFIG_ECHO is not set # CONFIG_MISC_ALCOR_PCI is not set # CONFIG_MISC_RTSX_PCI is not set # CONFIG_HABANA_AI is not set # end of Misc devices CONFIG_HAVE_IDE=y # CONFIG_IDE is not set # # SCSI device support # CONFIG_SCSI_MOD=y # CONFIG_RAID_ATTRS is not set # CONFIG_SCSI is not set # end of SCSI device support # CONFIG_ATA is not set # CONFIG_MD is not set # CONFIG_TARGET_CORE is not set # CONFIG_FUSION is not set # # IEEE 1394 (FireWire) support # # CONFIG_FIREWIRE is not set # CONFIG_FIREWIRE_NOSY is not set # end of IEEE 1394 (FireWire) support # CONFIG_MACINTOSH_DRIVERS is not set CONFIG_NETDEVICES=y CONFIG_NET_CORE=y # CONFIG_BONDING is not set # CONFIG_DUMMY is not set # CONFIG_WIREGUARD is not set # 
CONFIG_EQUALIZER is not set # CONFIG_IFB is not set # CONFIG_NET_TEAM is not set # CONFIG_MACVLAN is not set # CONFIG_IPVLAN is not set CONFIG_VXLAN=y # CONFIG_GENEVE is not set # CONFIG_BAREUDP is not set # CONFIG_GTP is not set # CONFIG_MACSEC is not set # CONFIG_NETCONSOLE is not set CONFIG_TUN=y # CONFIG_TUN_VNET_CROSS_LE is not set CONFIG_VETH=y CONFIG_VIRTIO_NET=y # CONFIG_NLMON is not set # CONFIG_ARCNET is not set # # Distributed Switch Architecture drivers # # end of Distributed Switch Architecture drivers # CONFIG_ETHERNET is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_NET_SB1000 is not set # CONFIG_MDIO_DEVICE is not set # CONFIG_PHYLIB is not set # CONFIG_PPP is not set # CONFIG_SLIP is not set # # Host-side USB support is needed for USB Network Adapter support # # CONFIG_WLAN is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers # # CONFIG_WAN is not set # CONFIG_VMXNET3 is not set # CONFIG_FUJITSU_ES is not set CONFIG_NETDEVSIM=y CONFIG_NET_FAILOVER=y # CONFIG_ISDN is not set # CONFIG_NVM is not set # # Input device support # CONFIG_INPUT=y CONFIG_INPUT_FF_MEMLESS=y # CONFIG_INPUT_POLLDEV is not set # CONFIG_INPUT_SPARSEKMAP is not set # CONFIG_INPUT_MATRIXKMAP is not set # # Userland interfaces # # CONFIG_INPUT_MOUSEDEV is not set # CONFIG_INPUT_JOYDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_EVBUG is not set # # Input Device Drivers # # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_INPUT_JOYSTICK is not set # CONFIG_INPUT_TABLET is not set # CONFIG_INPUT_TOUCHSCREEN is not set # CONFIG_INPUT_MISC is not set # CONFIG_RMI4_CORE is not set # # Hardware I/O ports # CONFIG_SERIO=y CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y CONFIG_SERIO_I8042=y CONFIG_SERIO_SERPORT=y # CONFIG_SERIO_CT82C710 is not set # CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set # CONFIG_SERIO_ALTERA_PS2 is not set # CONFIG_SERIO_PS2MULT is not set # CONFIG_SERIO_ARC_PS2 is not set 
# CONFIG_USERIO is not set # CONFIG_GAMEPORT is not set # end of Hardware I/O ports # end of Input device support # # Character devices # CONFIG_TTY=y CONFIG_VT=y CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set CONFIG_LDISC_AUTOLOAD=y # # Serial drivers # CONFIG_SERIAL_EARLYCON=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_DEPRECATED_OPTIONS=y CONFIG_SERIAL_8250_PNP=y # CONFIG_SERIAL_8250_16550A_VARIANTS is not set # CONFIG_SERIAL_8250_FINTEK is not set CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_EXAR=y CONFIG_SERIAL_8250_NR_UARTS=32 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 CONFIG_SERIAL_8250_EXTENDED=y CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y # CONFIG_SERIAL_8250_DW is not set # CONFIG_SERIAL_8250_RT288X is not set # CONFIG_SERIAL_8250_LPSS is not set # CONFIG_SERIAL_8250_MID is not set # # Non-8250 serial port support # # CONFIG_SERIAL_UARTLITE is not set CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set # CONFIG_SERIAL_LANTIQ is not set # CONFIG_SERIAL_SCCNXP is not set # CONFIG_SERIAL_ALTERA_JTAGUART is not set # CONFIG_SERIAL_ALTERA_UART is not set # CONFIG_SERIAL_ARC is not set # CONFIG_SERIAL_RP2 is not set # CONFIG_SERIAL_FSL_LPUART is not set # CONFIG_SERIAL_FSL_LINFLEXUART is not set # CONFIG_SERIAL_SPRD is not set # end of Serial drivers CONFIG_SERIAL_NONSTANDARD=y # CONFIG_ROCKETPORT is not set # CONFIG_CYCLADES is not set # CONFIG_MOXA_INTELLIO is not set # CONFIG_MOXA_SMARTIO is not set # CONFIG_SYNCLINK is not set # CONFIG_SYNCLINKMP is not set # CONFIG_SYNCLINK_GT is not set # CONFIG_ISI is not set # CONFIG_N_HDLC is not set # CONFIG_N_GSM is not set # CONFIG_NOZOMI is not set # CONFIG_NULL_TTY is not set # CONFIG_TRACE_SINK is not set CONFIG_HVC_DRIVER=y # CONFIG_SERIAL_DEV_BUS is not set # CONFIG_TTY_PRINTK is not 
set CONFIG_VIRTIO_CONSOLE=y # CONFIG_IPMI_HANDLER is not set # CONFIG_HW_RANDOM is not set # CONFIG_APPLICOM is not set # CONFIG_MWAVE is not set CONFIG_DEVMEM=y CONFIG_DEVKMEM=y # CONFIG_NVRAM is not set # CONFIG_RAW_DRIVER is not set CONFIG_DEVPORT=y CONFIG_HPET=y # CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set CONFIG_TCG_TPM=y CONFIG_TCG_TIS_CORE=y CONFIG_TCG_TIS=y # CONFIG_TCG_NSC is not set # CONFIG_TCG_ATMEL is not set # CONFIG_TCG_INFINEON is not set CONFIG_TCG_CRB=y # CONFIG_TCG_VTPM_PROXY is not set # CONFIG_TELCLOCK is not set # CONFIG_XILLYBUS is not set # end of Character devices # CONFIG_RANDOM_TRUST_CPU is not set # CONFIG_RANDOM_TRUST_BOOTLOADER is not set # # I2C support # # CONFIG_I2C is not set # end of I2C support # CONFIG_I3C is not set # CONFIG_SPI is not set # CONFIG_SPMI is not set # CONFIG_HSI is not set CONFIG_PPS=y # CONFIG_PPS_DEBUG is not set # # PPS clients support # # CONFIG_PPS_CLIENT_KTIMER is not set # CONFIG_PPS_CLIENT_LDISC is not set # CONFIG_PPS_CLIENT_GPIO is not set # # PPS generators support # # # PTP clock support # CONFIG_PTP_1588_CLOCK=y # # Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. 
# # end of PTP clock support # CONFIG_PINCTRL is not set # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set # CONFIG_POWER_AVS is not set # CONFIG_POWER_RESET is not set CONFIG_POWER_SUPPLY=y # CONFIG_POWER_SUPPLY_DEBUG is not set # CONFIG_PDA_POWER is not set # CONFIG_TEST_POWER is not set # CONFIG_BATTERY_DS2780 is not set # CONFIG_BATTERY_DS2781 is not set # CONFIG_BATTERY_BQ27XXX is not set # CONFIG_CHARGER_MAX8903 is not set # CONFIG_HWMON is not set CONFIG_THERMAL=y # CONFIG_THERMAL_NETLINK is not set # CONFIG_THERMAL_STATISTICS is not set CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y # CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set # CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set # CONFIG_THERMAL_GOV_FAIR_SHARE is not set CONFIG_THERMAL_GOV_STEP_WISE=y # CONFIG_THERMAL_GOV_BANG_BANG is not set CONFIG_THERMAL_GOV_USER_SPACE=y # CONFIG_THERMAL_EMULATION is not set # # Intel thermal drivers # CONFIG_INTEL_POWERCLAMP=y CONFIG_X86_PKG_TEMP_THERMAL=m # CONFIG_INTEL_SOC_DTS_THERMAL is not set # # ACPI INT340X thermal drivers # # CONFIG_INT340X_THERMAL is not set # end of ACPI INT340X thermal drivers # CONFIG_INTEL_PCH_THERMAL is not set # end of Intel thermal drivers # CONFIG_WATCHDOG is not set CONFIG_SSB_POSSIBLE=y # CONFIG_SSB is not set CONFIG_BCMA_POSSIBLE=y # CONFIG_BCMA is not set # # Multifunction device drivers # # CONFIG_MFD_MADERA is not set # CONFIG_HTC_PASIC3 is not set # CONFIG_MFD_INTEL_QUARK_I2C_GPIO is not set # CONFIG_LPC_ICH is not set # CONFIG_LPC_SCH is not set # CONFIG_MFD_INTEL_LPSS_ACPI is not set # CONFIG_MFD_INTEL_LPSS_PCI is not set # CONFIG_MFD_JANZ_CMODIO is not set # CONFIG_MFD_KEMPLD is not set # CONFIG_MFD_MT6397 is not set # CONFIG_MFD_RDC321X is not set # CONFIG_MFD_SM501 is not set # CONFIG_ABX500_CORE is not set # CONFIG_MFD_SYSCON is not set # CONFIG_MFD_TI_AM335X_TSCADC is not set # CONFIG_MFD_TQMX86 is not set # CONFIG_MFD_VX855 is not set # end of 
Multifunction device drivers # CONFIG_REGULATOR is not set # CONFIG_RC_CORE is not set # CONFIG_MEDIA_CEC_SUPPORT is not set # CONFIG_MEDIA_SUPPORT is not set # # Graphics support # CONFIG_AGP=y CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y CONFIG_AGP_SIS=y CONFIG_AGP_VIA=y CONFIG_INTEL_GTT=y CONFIG_VGA_ARB=y CONFIG_VGA_ARB_MAX_GPUS=16 # CONFIG_VGA_SWITCHEROO is not set # CONFIG_DRM is not set # # ARM devices # # end of ARM devices # # Frame buffer Devices # CONFIG_FB_CMDLINE=y CONFIG_FB_NOTIFY=y CONFIG_FB=y # CONFIG_FIRMWARE_EDID is not set CONFIG_FB_BOOT_VESA_SUPPORT=y CONFIG_FB_CFB_FILLRECT=y CONFIG_FB_CFB_COPYAREA=y CONFIG_FB_CFB_IMAGEBLIT=y # CONFIG_FB_FOREIGN_ENDIAN is not set CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y # # Frame buffer hardware drivers # # CONFIG_FB_CIRRUS is not set # CONFIG_FB_PM2 is not set # CONFIG_FB_CYBER2000 is not set # CONFIG_FB_ARC is not set # CONFIG_FB_ASILIANT is not set # CONFIG_FB_IMSTT is not set # CONFIG_FB_VGA16 is not set CONFIG_FB_VESA=y # CONFIG_FB_EFI is not set # CONFIG_FB_N411 is not set # CONFIG_FB_HGA is not set # CONFIG_FB_OPENCORES is not set # CONFIG_FB_S1D13XXX is not set # CONFIG_FB_NVIDIA is not set # CONFIG_FB_RIVA is not set # CONFIG_FB_I740 is not set # CONFIG_FB_LE80578 is not set # CONFIG_FB_INTEL is not set # CONFIG_FB_MATROX is not set # CONFIG_FB_RADEON is not set # CONFIG_FB_ATY128 is not set # CONFIG_FB_ATY is not set # CONFIG_FB_S3 is not set # CONFIG_FB_SAVAGE is not set # CONFIG_FB_SIS is not set # CONFIG_FB_NEOMAGIC is not set # CONFIG_FB_KYRO is not set # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set # CONFIG_FB_CARMINE is not set # CONFIG_FB_IBM_GXT4500 is not set # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set # CONFIG_FB_SIMPLE is not set # CONFIG_FB_SM712 is not set # end of Frame buffer Devices # # Backlight & LCD device 
support # # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_BACKLIGHT_APPLE is not set # CONFIG_BACKLIGHT_QCOM_WLED is not set # CONFIG_BACKLIGHT_SAHARA is not set # end of Backlight & LCD device support # # Console display driver support # CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 # CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set CONFIG_DUMMY_CONSOLE=y CONFIG_DUMMY_CONSOLE_COLUMNS=80 CONFIG_DUMMY_CONSOLE_ROWS=25 CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y # CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER is not set # end of Console display driver support CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_LOGO_LINUX_CLUT224=y # end of Graphics support # CONFIG_SOUND is not set # # HID support # CONFIG_HID=y # CONFIG_HID_BATTERY_STRENGTH is not set # CONFIG_HIDRAW is not set # CONFIG_UHID is not set CONFIG_HID_GENERIC=y # # Special HID drivers # CONFIG_HID_A4TECH=y # CONFIG_HID_ACRUX is not set CONFIG_HID_APPLE=y # CONFIG_HID_AUREAL is not set CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y # CONFIG_HID_COUGAR is not set # CONFIG_HID_MACALLY is not set # CONFIG_HID_CMEDIA is not set CONFIG_HID_CYPRESS=y CONFIG_HID_DRAGONRISE=y # CONFIG_DRAGONRISE_FF is not set # CONFIG_HID_EMS_FF is not set # CONFIG_HID_ELECOM is not set CONFIG_HID_EZKEY=y # CONFIG_HID_GEMBIRD is not set # CONFIG_HID_GFRM is not set # CONFIG_HID_GLORIOUS is not set # CONFIG_HID_KEYTOUCH is not set CONFIG_HID_KYE=y # CONFIG_HID_WALTOP is not set # CONFIG_HID_VIEWSONIC is not set CONFIG_HID_GYRATION=y # CONFIG_HID_ICADE is not set # CONFIG_HID_ITE is not set # CONFIG_HID_JABRA is not set CONFIG_HID_TWINHAN=y CONFIG_HID_KENSINGTON=y # CONFIG_HID_LCPOWER is not set # CONFIG_HID_LENOVO is not set # CONFIG_HID_MAGICMOUSE is not set # CONFIG_HID_MALTRON is not set # CONFIG_HID_MAYFLASH is 
not set # CONFIG_HID_REDRAGON is not set CONFIG_HID_MICROSOFT=y CONFIG_HID_MONTEREY=y # CONFIG_HID_MULTITOUCH is not set # CONFIG_HID_NTI is not set # CONFIG_HID_ORTEK is not set CONFIG_HID_PANTHERLORD=y # CONFIG_PANTHERLORD_FF is not set CONFIG_HID_PETALYNX=y # CONFIG_HID_PICOLCD is not set # CONFIG_HID_PLANTRONICS is not set # CONFIG_HID_PRIMAX is not set # CONFIG_HID_SAITEK is not set CONFIG_HID_SAMSUNG=y # CONFIG_HID_SPEEDLINK is not set # CONFIG_HID_STEAM is not set # CONFIG_HID_STEELSERIES is not set CONFIG_HID_SUNPLUS=y # CONFIG_HID_RMI is not set CONFIG_HID_GREENASIA=y # CONFIG_GREENASIA_FF is not set CONFIG_HID_SMARTJOYPLUS=y # CONFIG_SMARTJOYPLUS_FF is not set # CONFIG_HID_TIVO is not set CONFIG_HID_TOPSEED=y CONFIG_HID_THRUSTMASTER=y CONFIG_THRUSTMASTER_FF=y # CONFIG_HID_UDRAW_PS3 is not set # CONFIG_HID_XINMO is not set CONFIG_HID_ZEROPLUS=y CONFIG_ZEROPLUS_FF=y # CONFIG_HID_ZYDACRON is not set # CONFIG_HID_SENSOR_HUB is not set # CONFIG_HID_ALPS is not set # end of Special HID drivers # # Intel ISH HID support # # CONFIG_INTEL_ISH_HID is not set # end of Intel ISH HID support # end of HID support CONFIG_USB_OHCI_LITTLE_ENDIAN=y # CONFIG_USB_SUPPORT is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set # CONFIG_NEW_LEDS is not set # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC_ATOMIC_SCRUB=y CONFIG_EDAC_SUPPORT=y # CONFIG_EDAC is not set CONFIG_RTC_LIB=y CONFIG_RTC_MC146818_LIB=y # CONFIG_RTC_CLASS is not set # CONFIG_DMADEVICES is not set # # DMABUF options # CONFIG_SYNC_FILE=y # CONFIG_SW_SYNC is not set # CONFIG_UDMABUF is not set # CONFIG_DMABUF_MOVE_NOTIFY is not set # CONFIG_DMABUF_SELFTESTS is not set # CONFIG_DMABUF_HEAPS is not set # end of DMABUF options # CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set CONFIG_VIRT_DRIVERS=y # CONFIG_VBOXGUEST is not set CONFIG_VIRTIO=y CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_PCI_LEGACY=y CONFIG_VIRTIO_BALLOON=y # CONFIG_VIRTIO_INPUT is not set # 
CONFIG_VIRTIO_MMIO is not set # CONFIG_VDPA is not set CONFIG_VHOST_MENU=y # CONFIG_VHOST_NET is not set # CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set # # Microsoft Hyper-V guest support # # end of Microsoft Hyper-V guest support # CONFIG_GREYBUS is not set # CONFIG_STAGING is not set # CONFIG_X86_PLATFORM_DEVICES is not set CONFIG_PMC_ATOM=y # CONFIG_MFD_CROS_EC is not set # CONFIG_CHROME_PLATFORMS is not set # CONFIG_MELLANOX_PLATFORM is not set CONFIG_HAVE_CLK=y CONFIG_CLKDEV_LOOKUP=y CONFIG_HAVE_CLK_PREPARE=y CONFIG_COMMON_CLK=y # CONFIG_HWSPINLOCK is not set # # Clock Source drivers # CONFIG_CLKEVT_I8253=y CONFIG_I8253_LOCK=y CONFIG_CLKBLD_I8253=y # end of Clock Source drivers CONFIG_MAILBOX=y CONFIG_PCC=y # CONFIG_ALTERA_MBOX is not set # CONFIG_IOMMU_SUPPORT is not set # # Remoteproc drivers # # CONFIG_REMOTEPROC is not set # end of Remoteproc drivers # # Rpmsg drivers # # CONFIG_RPMSG_QCOM_GLINK_RPM is not set # CONFIG_RPMSG_VIRTIO is not set # end of Rpmsg drivers # CONFIG_SOUNDWIRE is not set # # SOC (System On Chip) specific Drivers # # # Amlogic SoC drivers # # end of Amlogic SoC drivers # # Aspeed SoC drivers # # end of Aspeed SoC drivers # # Broadcom SoC drivers # # end of Broadcom SoC drivers # # NXP/Freescale QorIQ SoC drivers # # end of NXP/Freescale QorIQ SoC drivers # # i.MX SoC drivers # # end of i.MX SoC drivers # # Qualcomm SoC drivers # # end of Qualcomm SoC drivers # CONFIG_SOC_TI is not set # # Xilinx SoC drivers # # CONFIG_XILINX_VCU is not set # end of Xilinx SoC drivers # end of SOC (System On Chip) specific Drivers # CONFIG_PM_DEVFREQ is not set # CONFIG_EXTCON is not set # CONFIG_MEMORY is not set # CONFIG_IIO is not set # CONFIG_NTB is not set # CONFIG_VME_BUS is not set # CONFIG_PWM is not set # # IRQ chip support # # end of IRQ chip support # CONFIG_IPACK_BUS is not set # CONFIG_RESET_CONTROLLER is not set # # PHY Subsystem # CONFIG_GENERIC_PHY=y # CONFIG_BCM_KONA_USB2_PHY is not set # CONFIG_PHY_PXA_28NM_HSIC is not set # 
CONFIG_PHY_PXA_28NM_USB2 is not set # CONFIG_PHY_INTEL_EMMC is not set # end of PHY Subsystem # CONFIG_POWERCAP is not set # CONFIG_MCB is not set # # Performance monitor support # # end of Performance monitor support CONFIG_RAS=y # CONFIG_RAS_CEC is not set # CONFIG_USB4 is not set # # Android # # CONFIG_ANDROID is not set # end of Android # CONFIG_LIBNVDIMM is not set # CONFIG_DAX is not set CONFIG_NVMEM=y # CONFIG_NVMEM_SYSFS is not set # # HW tracing support # # CONFIG_STM is not set # CONFIG_INTEL_TH is not set # end of HW tracing support # CONFIG_FPGA is not set # CONFIG_TEE is not set # CONFIG_UNISYS_VISORBUS is not set # CONFIG_SIOX is not set # CONFIG_SLIMBUS is not set # CONFIG_INTERCONNECT is not set # CONFIG_COUNTER is not set # end of Device Drivers # # File systems # CONFIG_DCACHE_WORD_ACCESS=y CONFIG_VALIDATE_FS_PARSER=y CONFIG_FS_IOMAP=y # CONFIG_EXT2_FS is not set # CONFIG_EXT3_FS is not set CONFIG_EXT4_FS=y CONFIG_EXT4_USE_FOR_EXT2=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y # CONFIG_EXT4_DEBUG is not set CONFIG_JBD2=y # CONFIG_JBD2_DEBUG is not set CONFIG_FS_MBCACHE=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_BTRFS_FS is not set # CONFIG_NILFS2_FS is not set # CONFIG_F2FS_FS is not set # CONFIG_FS_DAX is not set CONFIG_FS_POSIX_ACL=y CONFIG_EXPORTFS=y # CONFIG_EXPORTFS_BLOCK_OPS is not set CONFIG_FILE_LOCKING=y CONFIG_MANDATORY_FILE_LOCKING=y # CONFIG_FS_ENCRYPTION is not set # CONFIG_FS_VERITY is not set CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY_USER=y # CONFIG_FANOTIFY is not set # CONFIG_QUOTA is not set # CONFIG_AUTOFS4_FS is not set # CONFIG_AUTOFS_FS is not set # CONFIG_FUSE_FS is not set # CONFIG_OVERLAY_FS is not set # # Caches # # CONFIG_FSCACHE is not set # end of Caches # # CD-ROM/DVD Filesystems # # CONFIG_ISO9660_FS is not set # CONFIG_UDF_FS is not set # end of CD-ROM/DVD Filesystems # # DOS/FAT/EXFAT/NT Filesystems # # 
CONFIG_MSDOS_FS is not set # CONFIG_VFAT_FS is not set # CONFIG_EXFAT_FS is not set # CONFIG_NTFS_FS is not set # end of DOS/FAT/EXFAT/NT Filesystems # # Pseudo filesystems # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_PROC_SYSCTL=y CONFIG_PROC_PAGE_MONITOR=y # CONFIG_PROC_CHILDREN is not set CONFIG_PROC_PID_ARCH_STATUS=y CONFIG_KERNFS=y CONFIG_SYSFS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y # CONFIG_CONFIGFS_FS is not set # CONFIG_EFIVAR_FS is not set # end of Pseudo filesystems # CONFIG_MISC_FILESYSTEMS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set # CONFIG_CEPH_FS is not set # CONFIG_CIFS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set CONFIG_9P_FS=y CONFIG_9P_FS_POSIX_ACL=y CONFIG_9P_FS_SECURITY=y CONFIG_NLS=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y # CONFIG_NLS_CODEPAGE_737 is not set # CONFIG_NLS_CODEPAGE_775 is not set # CONFIG_NLS_CODEPAGE_850 is not set # CONFIG_NLS_CODEPAGE_852 is not set # CONFIG_NLS_CODEPAGE_855 is not set # CONFIG_NLS_CODEPAGE_857 is not set # CONFIG_NLS_CODEPAGE_860 is not set # CONFIG_NLS_CODEPAGE_861 is not set # CONFIG_NLS_CODEPAGE_862 is not set # CONFIG_NLS_CODEPAGE_863 is not set # CONFIG_NLS_CODEPAGE_864 is not set # CONFIG_NLS_CODEPAGE_865 is not set # CONFIG_NLS_CODEPAGE_866 is not set # CONFIG_NLS_CODEPAGE_869 is not set # CONFIG_NLS_CODEPAGE_936 is not set # CONFIG_NLS_CODEPAGE_950 is not set # CONFIG_NLS_CODEPAGE_932 is not set # CONFIG_NLS_CODEPAGE_949 is not set # CONFIG_NLS_CODEPAGE_874 is not set # CONFIG_NLS_ISO8859_8 is not set # CONFIG_NLS_CODEPAGE_1250 is not set # CONFIG_NLS_CODEPAGE_1251 is not set CONFIG_NLS_ASCII=y # CONFIG_NLS_ISO8859_1 is not set # CONFIG_NLS_ISO8859_2 is not set # CONFIG_NLS_ISO8859_3 is not set # CONFIG_NLS_ISO8859_4 is not set # CONFIG_NLS_ISO8859_5 is not set # 
CONFIG_NLS_ISO8859_6 is not set # CONFIG_NLS_ISO8859_7 is not set # CONFIG_NLS_ISO8859_9 is not set # CONFIG_NLS_ISO8859_13 is not set # CONFIG_NLS_ISO8859_14 is not set # CONFIG_NLS_ISO8859_15 is not set # CONFIG_NLS_KOI8_R is not set # CONFIG_NLS_KOI8_U is not set # CONFIG_NLS_MAC_ROMAN is not set # CONFIG_NLS_MAC_CELTIC is not set # CONFIG_NLS_MAC_CENTEURO is not set # CONFIG_NLS_MAC_CROATIAN is not set # CONFIG_NLS_MAC_CYRILLIC is not set # CONFIG_NLS_MAC_GAELIC is not set # CONFIG_NLS_MAC_GREEK is not set # CONFIG_NLS_MAC_ICELAND is not set # CONFIG_NLS_MAC_INUIT is not set # CONFIG_NLS_MAC_ROMANIAN is not set # CONFIG_NLS_MAC_TURKISH is not set # CONFIG_NLS_UTF8 is not set # CONFIG_UNICODE is not set CONFIG_IO_WQ=y # end of File systems # # Security options # CONFIG_KEYS=y # CONFIG_KEYS_REQUEST_CACHE is not set # CONFIG_PERSISTENT_KEYRINGS is not set # CONFIG_TRUSTED_KEYS is not set # CONFIG_ENCRYPTED_KEYS is not set # CONFIG_KEY_DH_OPERATIONS is not set # CONFIG_SECURITY_DMESG_RESTRICT is not set CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SECURITY_NETWORK=y CONFIG_PAGE_TABLE_ISOLATION=y # CONFIG_SECURITY_NETWORK_XFRM is not set # CONFIG_SECURITY_PATH is not set CONFIG_LSM_MMAP_MIN_ADDR=65536 CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y # CONFIG_HARDENED_USERCOPY is not set # CONFIG_FORTIFY_SOURCE is not set # CONFIG_STATIC_USERMODEHELPER is not set CONFIG_SECURITY_SELINUX=y # CONFIG_SECURITY_SELINUX_BOOTPARAM is not set # CONFIG_SECURITY_SELINUX_DISABLE is not set CONFIG_SECURITY_SELINUX_DEVELOP=y CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 # CONFIG_SECURITY_SMACK is not set # CONFIG_SECURITY_TOMOYO is not set # CONFIG_SECURITY_APPARMOR is not set # CONFIG_SECURITY_LOADPIN is not set # CONFIG_SECURITY_YAMA is not set # CONFIG_SECURITY_SAFESETID is not set # CONFIG_SECURITY_LOCKDOWN_LSM is not set CONFIG_INTEGRITY=y # 
CONFIG_INTEGRITY_SIGNATURE is not set CONFIG_INTEGRITY_AUDIT=y CONFIG_IMA=y CONFIG_IMA_MEASURE_PCR_IDX=10 CONFIG_IMA_LSM_RULES=y # CONFIG_IMA_TEMPLATE is not set CONFIG_IMA_NG_TEMPLATE=y # CONFIG_IMA_SIG_TEMPLATE is not set CONFIG_IMA_DEFAULT_TEMPLATE="ima-ng" CONFIG_IMA_DEFAULT_HASH_SHA1=y # CONFIG_IMA_DEFAULT_HASH_SHA256 is not set CONFIG_IMA_DEFAULT_HASH="sha1" CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y # CONFIG_IMA_APPRAISE is not set CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS=y CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS=y # CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set # CONFIG_EVM is not set # CONFIG_DEFAULT_SECURITY_SELINUX is not set CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_LSM="selinux,bpf,integrity" # # Kernel hardening options # # # Memory initialization # CONFIG_INIT_STACK_NONE=y # CONFIG_INIT_ON_ALLOC_DEFAULT_ON is not set # CONFIG_INIT_ON_FREE_DEFAULT_ON is not set # end of Memory initialization # end of Kernel hardening options # end of Security options CONFIG_CRYPTO=y # # Crypto core or helper # CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_ALGAPI2=y CONFIG_CRYPTO_AEAD=y CONFIG_CRYPTO_AEAD2=y CONFIG_CRYPTO_SKCIPHER=y CONFIG_CRYPTO_SKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG=y CONFIG_CRYPTO_RNG2=y CONFIG_CRYPTO_RNG_DEFAULT=y CONFIG_CRYPTO_AKCIPHER2=y CONFIG_CRYPTO_AKCIPHER=y CONFIG_CRYPTO_KPP2=y CONFIG_CRYPTO_ACOMP2=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_USER is not set CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y CONFIG_CRYPTO_GF128MUL=y CONFIG_CRYPTO_NULL=y CONFIG_CRYPTO_NULL2=y # CONFIG_CRYPTO_PCRYPT is not set # CONFIG_CRYPTO_CRYPTD is not set # CONFIG_CRYPTO_AUTHENC is not set # CONFIG_CRYPTO_TEST is not set CONFIG_CRYPTO_ENGINE=m # # Public-key cryptography # CONFIG_CRYPTO_RSA=y # CONFIG_CRYPTO_DH is not set # CONFIG_CRYPTO_ECDH is not set # CONFIG_CRYPTO_ECRDSA is not set # CONFIG_CRYPTO_CURVE25519 is not set # CONFIG_CRYPTO_CURVE25519_X86 is not set # # Authenticated Encryption with Associated Data # # 
CONFIG_CRYPTO_CCM is not set CONFIG_CRYPTO_GCM=y # CONFIG_CRYPTO_CHACHA20POLY1305 is not set # CONFIG_CRYPTO_AEGIS128 is not set # CONFIG_CRYPTO_AEGIS128_AESNI_SSE2 is not set CONFIG_CRYPTO_SEQIV=y # CONFIG_CRYPTO_ECHAINIV is not set # # Block modes # # CONFIG_CRYPTO_CBC is not set # CONFIG_CRYPTO_CFB is not set CONFIG_CRYPTO_CTR=y # CONFIG_CRYPTO_CTS is not set # CONFIG_CRYPTO_ECB is not set # CONFIG_CRYPTO_LRW is not set # CONFIG_CRYPTO_OFB is not set # CONFIG_CRYPTO_PCBC is not set # CONFIG_CRYPTO_XTS is not set # CONFIG_CRYPTO_KEYWRAP is not set # CONFIG_CRYPTO_NHPOLY1305_SSE2 is not set # CONFIG_CRYPTO_NHPOLY1305_AVX2 is not set # CONFIG_CRYPTO_ADIANTUM is not set # CONFIG_CRYPTO_ESSIV is not set # # Hash modes # # CONFIG_CRYPTO_CMAC is not set CONFIG_CRYPTO_HMAC=y # CONFIG_CRYPTO_XCBC is not set # CONFIG_CRYPTO_VMAC is not set # # Digest # CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_CRC32C_INTEL is not set # CONFIG_CRYPTO_CRC32 is not set # CONFIG_CRYPTO_CRC32_PCLMUL is not set CONFIG_CRYPTO_XXHASH=y CONFIG_CRYPTO_BLAKE2B=y # CONFIG_CRYPTO_BLAKE2S is not set # CONFIG_CRYPTO_BLAKE2S_X86 is not set CONFIG_CRYPTO_CRCT10DIF=y # CONFIG_CRYPTO_CRCT10DIF_PCLMUL is not set CONFIG_CRYPTO_GHASH=y # CONFIG_CRYPTO_POLY1305 is not set # CONFIG_CRYPTO_POLY1305_X86_64 is not set # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y # CONFIG_CRYPTO_MICHAEL_MIC is not set # CONFIG_CRYPTO_RMD128 is not set # CONFIG_CRYPTO_RMD160 is not set # CONFIG_CRYPTO_RMD256 is not set # CONFIG_CRYPTO_RMD320 is not set CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_SHA1_SSSE3 is not set # CONFIG_CRYPTO_SHA256_SSSE3 is not set # CONFIG_CRYPTO_SHA512_SSSE3 is not set CONFIG_CRYPTO_SHA256=y # CONFIG_CRYPTO_SHA512 is not set # CONFIG_CRYPTO_SHA3 is not set # CONFIG_CRYPTO_SM3 is not set # CONFIG_CRYPTO_STREEBOG is not set # CONFIG_CRYPTO_TGR192 is not set # CONFIG_CRYPTO_WP512 is not set # CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL is not set # # Ciphers # CONFIG_CRYPTO_AES=y # CONFIG_CRYPTO_AES_TI is not set # 
CONFIG_CRYPTO_AES_NI_INTEL is not set # CONFIG_CRYPTO_ANUBIS is not set # CONFIG_CRYPTO_ARC4 is not set # CONFIG_CRYPTO_BLOWFISH is not set # CONFIG_CRYPTO_BLOWFISH_X86_64 is not set # CONFIG_CRYPTO_CAMELLIA is not set # CONFIG_CRYPTO_CAMELLIA_X86_64 is not set # CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64 is not set # CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 is not set # CONFIG_CRYPTO_CAST5 is not set # CONFIG_CRYPTO_CAST5_AVX_X86_64 is not set # CONFIG_CRYPTO_CAST6 is not set # CONFIG_CRYPTO_CAST6_AVX_X86_64 is not set # CONFIG_CRYPTO_DES is not set # CONFIG_CRYPTO_DES3_EDE_X86_64 is not set # CONFIG_CRYPTO_FCRYPT is not set # CONFIG_CRYPTO_KHAZAD is not set # CONFIG_CRYPTO_SALSA20 is not set # CONFIG_CRYPTO_CHACHA20 is not set # CONFIG_CRYPTO_CHACHA20_X86_64 is not set # CONFIG_CRYPTO_SEED is not set # CONFIG_CRYPTO_SERPENT is not set # CONFIG_CRYPTO_SERPENT_SSE2_X86_64 is not set # CONFIG_CRYPTO_SERPENT_AVX_X86_64 is not set # CONFIG_CRYPTO_SERPENT_AVX2_X86_64 is not set # CONFIG_CRYPTO_SM4 is not set # CONFIG_CRYPTO_TEA is not set # CONFIG_CRYPTO_TWOFISH is not set # CONFIG_CRYPTO_TWOFISH_X86_64 is not set # CONFIG_CRYPTO_TWOFISH_X86_64_3WAY is not set # CONFIG_CRYPTO_TWOFISH_AVX_X86_64 is not set # # Compression # # CONFIG_CRYPTO_DEFLATE is not set # CONFIG_CRYPTO_LZO is not set # CONFIG_CRYPTO_842 is not set # CONFIG_CRYPTO_LZ4 is not set # CONFIG_CRYPTO_LZ4HC is not set # CONFIG_CRYPTO_ZSTD is not set # # Random Number Generation # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DRBG_MENU=y CONFIG_CRYPTO_DRBG_HMAC=y # CONFIG_CRYPTO_DRBG_HASH is not set # CONFIG_CRYPTO_DRBG_CTR is not set CONFIG_CRYPTO_DRBG=y CONFIG_CRYPTO_JITTERENTROPY=y CONFIG_CRYPTO_USER_API=y CONFIG_CRYPTO_USER_API_HASH=y # CONFIG_CRYPTO_USER_API_SKCIPHER is not set # CONFIG_CRYPTO_USER_API_RNG is not set # CONFIG_CRYPTO_USER_API_AEAD is not set CONFIG_CRYPTO_HASH_INFO=y # # Crypto library routines # CONFIG_CRYPTO_LIB_AES=y # CONFIG_CRYPTO_LIB_BLAKE2S is not set # 
CONFIG_CRYPTO_LIB_CHACHA is not set # CONFIG_CRYPTO_LIB_CURVE25519 is not set CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 # CONFIG_CRYPTO_LIB_POLY1305 is not set # CONFIG_CRYPTO_LIB_CHACHA20POLY1305 is not set CONFIG_CRYPTO_LIB_SHA256=y CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_PADLOCK is not set # CONFIG_CRYPTO_DEV_CCP is not set # CONFIG_CRYPTO_DEV_QAT_DH895xCC is not set # CONFIG_CRYPTO_DEV_QAT_C3XXX is not set # CONFIG_CRYPTO_DEV_QAT_C62X is not set # CONFIG_CRYPTO_DEV_QAT_DH895xCCVF is not set # CONFIG_CRYPTO_DEV_QAT_C3XXXVF is not set # CONFIG_CRYPTO_DEV_QAT_C62XVF is not set # CONFIG_CRYPTO_DEV_NITROX_CNN55XX is not set CONFIG_CRYPTO_DEV_VIRTIO=m # CONFIG_CRYPTO_DEV_SAFEXCEL is not set # CONFIG_CRYPTO_DEV_AMLOGIC_GXL is not set CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y CONFIG_X509_CERTIFICATE_PARSER=y # CONFIG_PKCS8_PRIVATE_KEY_PARSER is not set CONFIG_PKCS7_MESSAGE_PARSER=y # # Certificates for signature checking # CONFIG_SYSTEM_TRUSTED_KEYRING=y CONFIG_SYSTEM_TRUSTED_KEYS="" # CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set # CONFIG_SECONDARY_TRUSTED_KEYRING is not set # CONFIG_SYSTEM_BLACKLIST_KEYRING is not set # end of Certificates for signature checking CONFIG_BINARY_PRINTF=y # # Library routines # # CONFIG_PACKING is not set CONFIG_BITREVERSE=y CONFIG_GENERIC_STRNCPY_FROM_USER=y CONFIG_GENERIC_STRNLEN_USER=y CONFIG_GENERIC_NET_UTILS=y CONFIG_GENERIC_FIND_FIRST_BIT=y # CONFIG_CORDIC is not set # CONFIG_PRIME_NUMBERS is not set CONFIG_RATIONAL=y CONFIG_GENERIC_PCI_IOMAP=y CONFIG_GENERIC_IOMAP=y CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y CONFIG_ARCH_HAS_FAST_MULTIPLIER=y CONFIG_ARCH_USE_SYM_ANNOTATIONS=y CONFIG_CRC_CCITT=y CONFIG_CRC16=y CONFIG_CRC_T10DIF=y # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC32_SELFTEST is not set CONFIG_CRC32_SLICEBY8=y # CONFIG_CRC32_SLICEBY4 is not set # CONFIG_CRC32_SARWATE is not set # CONFIG_CRC32_BIT is not set # CONFIG_CRC64 is not set # CONFIG_CRC4 is not set # CONFIG_CRC7 is not set CONFIG_LIBCRC32C=y # 
CONFIG_CRC8 is not set CONFIG_XXHASH=y # CONFIG_RANDOM32_SELFTEST is not set CONFIG_ZLIB_INFLATE=y CONFIG_LZO_DECOMPRESS=y CONFIG_LZ4_DECOMPRESS=y CONFIG_ZSTD_DECOMPRESS=y CONFIG_XZ_DEC=y CONFIG_XZ_DEC_X86=y # CONFIG_XZ_DEC_POWERPC is not set # CONFIG_XZ_DEC_IA64 is not set # CONFIG_XZ_DEC_ARM is not set # CONFIG_XZ_DEC_ARMTHUMB is not set # CONFIG_XZ_DEC_SPARC is not set CONFIG_XZ_DEC_BCJ=y # CONFIG_XZ_DEC_TEST is not set CONFIG_DECOMPRESS_GZIP=y CONFIG_DECOMPRESS_BZIP2=y CONFIG_DECOMPRESS_LZMA=y CONFIG_DECOMPRESS_XZ=y CONFIG_DECOMPRESS_LZO=y CONFIG_DECOMPRESS_LZ4=y CONFIG_DECOMPRESS_ZSTD=y CONFIG_GENERIC_ALLOCATOR=y CONFIG_XARRAY_MULTI=y CONFIG_ASSOCIATIVE_ARRAY=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT_MAP=y CONFIG_HAS_DMA=y CONFIG_DMA_OPS=y CONFIG_NEED_SG_DMA_LENGTH=y CONFIG_NEED_DMA_MAP_STATE=y CONFIG_ARCH_DMA_ADDR_T_64BIT=y CONFIG_SWIOTLB=y CONFIG_DMA_CMA=y # # Default contiguous memory area size: # CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CMA_SIZE_SEL_MBYTES=y # CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set # CONFIG_CMA_SIZE_SEL_MIN is not set # CONFIG_CMA_SIZE_SEL_MAX is not set CONFIG_CMA_ALIGNMENT=8 # CONFIG_DMA_API_DEBUG is not set CONFIG_SGL_ALLOC=y CONFIG_IOMMU_HELPER=y CONFIG_CPU_RMAP=y CONFIG_DQL=y CONFIG_GLOB=y # CONFIG_GLOB_SELFTEST is not set CONFIG_NLATTR=y CONFIG_CLZ_TAB=y CONFIG_IRQ_POLL=y CONFIG_MPILIB=y CONFIG_OID_REGISTRY=y CONFIG_UCS2_STRING=y CONFIG_HAVE_GENERIC_VDSO=y CONFIG_GENERIC_GETTIMEOFDAY=y CONFIG_GENERIC_VDSO_TIME_NS=y CONFIG_FONT_SUPPORT=y CONFIG_FONTS=y # CONFIG_FONT_8x8 is not set CONFIG_FONT_8x16=y # CONFIG_FONT_6x11 is not set # CONFIG_FONT_7x14 is not set # CONFIG_FONT_PEARL_8x8 is not set # CONFIG_FONT_ACORN_8x8 is not set CONFIG_FONT_MINI_4x6=y # CONFIG_FONT_6x10 is not set # CONFIG_FONT_10x18 is not set # CONFIG_FONT_SUN8x16 is not set # CONFIG_FONT_SUN12x22 is not set # CONFIG_FONT_TER16x32 is not set CONFIG_ARCH_HAS_PMEM_API=y CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y CONFIG_ARCH_HAS_UACCESS_MCSAFE=y CONFIG_ARCH_STACKWALK=y CONFIG_SBITMAP=y 
# CONFIG_STRING_SELFTEST is not set # end of Library routines # # Kernel hacking # # # printk and dmesg options # CONFIG_PRINTK_TIME=y # CONFIG_PRINTK_CALLER is not set CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7 CONFIG_CONSOLE_LOGLEVEL_QUIET=4 CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 # CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_DYNAMIC_DEBUG is not set # CONFIG_DYNAMIC_DEBUG_CORE is not set CONFIG_SYMBOLIC_ERRNAME=y CONFIG_DEBUG_BUGVERBOSE=y # end of printk and dmesg options # # Compile-time checks and compiler options # CONFIG_DEBUG_INFO=y # CONFIG_DEBUG_INFO_REDUCED is not set # CONFIG_DEBUG_INFO_COMPRESSED is not set # CONFIG_DEBUG_INFO_SPLIT is not set # CONFIG_DEBUG_INFO_DWARF4 is not set CONFIG_DEBUG_INFO_BTF=y # CONFIG_GDB_SCRIPTS is not set CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 # CONFIG_STRIP_ASM_SYMS is not set # CONFIG_READABLE_ASM is not set # CONFIG_HEADERS_INSTALL is not set # CONFIG_DEBUG_SECTION_MISMATCH is not set CONFIG_SECTION_MISMATCH_WARN_ONLY=y # CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B is not set CONFIG_STACK_VALIDATION=y # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set # end of Compile-time checks and compiler options # # Generic Kernel Debugging Instruments # CONFIG_MAGIC_SYSRQ=y CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 CONFIG_MAGIC_SYSRQ_SERIAL=y CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" CONFIG_DEBUG_FS=y CONFIG_DEBUG_FS_ALLOW_ALL=y # CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set # CONFIG_DEBUG_FS_ALLOW_NONE is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y # CONFIG_UBSAN is not set # end of Generic Kernel Debugging Instruments CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_MISC=y # # Memory Debugging # # CONFIG_PAGE_EXTENSION is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_PAGE_OWNER is not set # CONFIG_PAGE_POISONING is not set # CONFIG_DEBUG_PAGE_REF is not set # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_ARCH_HAS_DEBUG_WX=y # CONFIG_DEBUG_WX is not set CONFIG_GENERIC_PTDUMP=y # CONFIG_PTDUMP_DEBUGFS is not set # 
CONFIG_DEBUG_OBJECTS is not set # CONFIG_SLUB_DEBUG_ON is not set # CONFIG_SLUB_STATS is not set CONFIG_HAVE_DEBUG_KMEMLEAK=y # CONFIG_DEBUG_KMEMLEAK is not set # CONFIG_DEBUG_STACK_USAGE is not set CONFIG_SCHED_STACK_END_CHECK=y CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_VM_PGTABLE is not set CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y # CONFIG_DEBUG_VIRTUAL is not set CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_PER_CPU_MAPS is not set CONFIG_HAVE_ARCH_KASAN=y CONFIG_HAVE_ARCH_KASAN_VMALLOC=y CONFIG_CC_HAS_KASAN_GENERIC=y # end of Memory Debugging # CONFIG_DEBUG_SHIRQ is not set # # Debug Oops, Lockups and Hangs # CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_ON_OOPS_VALUE=1 CONFIG_PANIC_TIMEOUT=0 CONFIG_LOCKUP_DETECTOR=y CONFIG_SOFTLOCKUP_DETECTOR=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 CONFIG_HARDLOCKUP_DETECTOR_PERF=y CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=1 CONFIG_DETECT_HUNG_TASK=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 # CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 # CONFIG_WQ_WATCHDOG is not set # CONFIG_TEST_LOCKUP is not set # end of Debug Oops, Lockups and Hangs # # Scheduler Debugging # CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y # end of Scheduler Debugging # CONFIG_DEBUG_TIMEKEEPING is not set CONFIG_DEBUG_PREEMPT=y # # Lock Debugging (spinlocks, mutexes, etc...) 
# CONFIG_LOCK_DEBUGGING_SUPPORT=y CONFIG_PROVE_LOCKING=y # CONFIG_PROVE_RAW_LOCK_NESTING is not set # CONFIG_LOCK_STAT is not set CONFIG_DEBUG_RT_MUTEXES=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y CONFIG_DEBUG_RWSEMS=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_LOCKDEP=y # CONFIG_DEBUG_LOCKDEP is not set CONFIG_DEBUG_ATOMIC_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_LOCK_TORTURE_TEST is not set # CONFIG_WW_MUTEX_SELFTEST is not set # end of Lock Debugging (spinlocks, mutexes, etc...) CONFIG_TRACE_IRQFLAGS=y CONFIG_TRACE_IRQFLAGS_NMI=y CONFIG_STACKTRACE=y # CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set # CONFIG_DEBUG_KOBJECT is not set # # Debug kernel data structures # # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_PLIST is not set # CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_NOTIFIERS is not set # CONFIG_BUG_ON_DATA_CORRUPTION is not set # end of Debug kernel data structures CONFIG_DEBUG_CREDENTIALS=y # # RCU Debugging # CONFIG_PROVE_RCU=y # CONFIG_RCU_PERF_TEST is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_REF_SCALE_TEST is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_RCU_TRACE is not set # CONFIG_RCU_EQS_DEBUG is not set # end of RCU Debugging # CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set # CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set # CONFIG_LATENCYTOP is not set CONFIG_USER_STACKTRACE_SUPPORT=y CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_HAVE_FENTRY=y CONFIG_HAVE_C_RECORDMCOUNT=y CONFIG_TRACE_CLOCK=y CONFIG_RING_BUFFER=y CONFIG_EVENT_TRACING=y CONFIG_CONTEXT_SWITCH_TRACER=y CONFIG_PREEMPTIRQ_TRACEPOINTS=y CONFIG_TRACING=y CONFIG_GENERIC_TRACER=y CONFIG_TRACING_SUPPORT=y CONFIG_FTRACE=y CONFIG_BOOTTIME_TRACING=y 
CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_DYNAMIC_FTRACE=y CONFIG_DYNAMIC_FTRACE_WITH_REGS=y CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y # CONFIG_FUNCTION_PROFILER is not set # CONFIG_STACK_TRACER is not set # CONFIG_IRQSOFF_TRACER is not set # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_HWLAT_TRACER is not set # CONFIG_MMIOTRACE is not set CONFIG_FTRACE_SYSCALLS=y # CONFIG_TRACER_SNAPSHOT is not set CONFIG_BRANCH_PROFILE_NONE=y # CONFIG_PROFILE_ANNOTATED_BRANCHES is not set # CONFIG_PROFILE_ALL_BRANCHES is not set CONFIG_BLK_DEV_IO_TRACE=y CONFIG_KPROBE_EVENTS=y # CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set CONFIG_UPROBE_EVENTS=y CONFIG_BPF_EVENTS=y CONFIG_DYNAMIC_EVENTS=y CONFIG_PROBE_EVENTS=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_FTRACE_MCOUNT_RECORD=y # CONFIG_SYNTH_EVENTS is not set # CONFIG_HIST_TRIGGERS is not set # CONFIG_TRACE_EVENT_INJECT is not set # CONFIG_TRACEPOINT_BENCHMARK is not set # CONFIG_RING_BUFFER_BENCHMARK is not set # CONFIG_TRACE_EVAL_MAP_FILE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_RING_BUFFER_STARTUP_TEST is not set # CONFIG_PREEMPTIRQ_DELAY_TEST is not set # CONFIG_KPROBE_EVENT_GEN_TEST is not set # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KCSAN=y CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y # CONFIG_STRICT_DEVMEM is not set # # x86 Debugging # CONFIG_TRACE_IRQFLAGS_SUPPORT=y CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_EARLY_PRINTK=y # CONFIG_EARLY_PRINTK_DBGP is not set # CONFIG_EARLY_PRINTK_USB_XDBC is not set # CONFIG_EFI_PGT_DUMP is not set # CONFIG_DEBUG_TLBFLUSH is not set # CONFIG_IOMMU_DEBUG is not set CONFIG_HAVE_MMIOTRACE_SUPPORT=y # CONFIG_X86_DECODER_SELFTEST is not set CONFIG_IO_DELAY_0X80=y # CONFIG_IO_DELAY_0XED is not set # CONFIG_IO_DELAY_UDELAY is not set # CONFIG_IO_DELAY_NONE is not set # CONFIG_DEBUG_BOOT_PARAMS is not set # CONFIG_CPA_DEBUG is not set # CONFIG_DEBUG_ENTRY is not set # 
CONFIG_DEBUG_NMI_SELFTEST is not set CONFIG_X86_DEBUG_FPU=y # CONFIG_PUNIT_ATOM_DEBUG is not set CONFIG_UNWINDER_ORC=y # CONFIG_UNWINDER_FRAME_POINTER is not set # CONFIG_UNWINDER_GUESS is not set # end of x86 Debugging # # Kernel Testing and Coverage # # CONFIG_KUNIT is not set # CONFIG_NOTIFIER_ERROR_INJECTION is not set CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_FAULT_INJECTION=y # CONFIG_FAILSLAB is not set # CONFIG_FAIL_PAGE_ALLOC is not set # CONFIG_FAIL_MAKE_REQUEST is not set # CONFIG_FAIL_IO_TIMEOUT is not set # CONFIG_FAIL_FUTEX is not set CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAIL_FUNCTION=y CONFIG_ARCH_HAS_KCOV=y CONFIG_CC_HAS_SANCOV_TRACE_PC=y # CONFIG_KCOV is not set # CONFIG_RUNTIME_TESTING_MENU is not set # CONFIG_MEMTEST is not set # end of Kernel Testing and Coverage # end of Kernel hacking xdp-tools-1.6.1/.github/scripts/prepare_test_kernel.sh000077500000000000000000000015741514310632100230760ustar00rootroot00000000000000#!/bin/bash set -e IFS=- read KERNEL_UPSTREAM_VERSION KERNEL_PATCH_VERSION <<< $KERNEL_VERSION KERNEL_VERSION_COMPLETE="$KERNEL_VERSION".x86_64 PACKAGES_URL=https://kojipkgs.fedoraproject.org/packages/kernel/ PACKAGES_URL+="$KERNEL_UPSTREAM_VERSION"/"$KERNEL_PATCH_VERSION"/x86_64 for package in core modules modules-core modules-extra devel; do # modules-core package only exists for newer kernel versions, so continue if # download fails wget -nv "$PACKAGES_URL"/kernel-"$package"-"$KERNEL_VERSION_COMPLETE".rpm || continue rpm2cpio kernel-"$package"-"$KERNEL_VERSION_COMPLETE".rpm | cpio -di done find lib -name "*.xz" -exec xz -d {} \; mv lib/modules/"$KERNEL_VERSION_COMPLETE" kernel mkdir -p kernel/arch/x86/boot cp kernel/vmlinuz kernel/arch/x86/boot/bzImage cp kernel/config kernel/.config rsync -a usr/src/kernels/"$KERNEL_VERSION_COMPLETE"/ kernel/ find kernel xdp-tools-1.6.1/.github/scripts/prepare_test_tools.sh000077500000000000000000000001741514310632100227510ustar00rootroot00000000000000#!/bin/bash set -e echo ::group::Install 
xdp-test-harness sudo python3 -m pip install xdp_test_harness echo ::endgroup:: xdp-tools-1.6.1/.github/scripts/run_tests.sh000077500000000000000000000001541514310632100210600ustar00rootroot00000000000000#!/bin/bash export $(cat ENVVARS | xargs -d '\n') make test V=1 >> TEST_OUTPUT 2>&1 echo $? > TEST_RESULT xdp-tools-1.6.1/.github/scripts/run_tests_in_vm.sh000077500000000000000000000004721514310632100222530ustar00rootroot00000000000000#!/bin/bash ENVVARS="KERNEL_VERSION DID_UNSHARE CLANG" touch ENVVARS for v in $ENVVARS; do val=$(eval echo '$'$v) echo "$v=$val" >> ENVVARS done touch TEST_OUTPUT tail -f TEST_OUTPUT & sudo virtme-ng --run kernel --exec .github/scripts/run_tests.sh --rw --memory 2G kill %1 exit "$(cat TEST_RESULT)" xdp-tools-1.6.1/.github/workflows/000077500000000000000000000000001514310632100170415ustar00rootroot00000000000000xdp-tools-1.6.1/.github/workflows/covscan.yml000066400000000000000000000041751514310632100212270ustar00rootroot00000000000000name: coverity-scan on: schedule: - cron: '0 18 * * 0' # Sundays at 18:00 UTC push: branches: [ "coverity_scan" ] jobs: latest: permissions: contents: read runs-on: ubuntu-22.04 env: LLVM_VERSION: 19 CLANG: clang-19 steps: - name: Check out repository code uses: actions/checkout@v4 with: submodules: recursive - name: Prepare packages run: | sudo apt-get update sudo apt-get install zstd binutils-dev elfutils libpcap-dev libelf-dev gcc-multilib pkg-config wireshark tshark bpfcc-tools python3 python3-pip python3-setuptools qemu-kvm rpm2cpio libdw-dev libdwarf-dev - name: Prepare Clang run: | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-$LLVM_VERSION main" | sudo tee -a /etc/apt/sources.list sudo apt-get -qq update sudo apt-get -qq -y install clang-$LLVM_VERSION lld-$LLVM_VERSION llvm-$LLVM_VERSION - name: Download Coverity Build Tool run: | wget -q https://scan.coverity.com/download/cxx/linux64 --post-data 
"token=$TOKEN&project=xdp-project%2Fxdp-tools" -O cov-analysis-linux64.tar.gz mkdir cov-analysis-linux64 tar xzf cov-analysis-linux64.tar.gz --strip 1 -C cov-analysis-linux64 env: TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }} - name: Configure run: ./configure - name: Build with cov-build run: | export PATH=`pwd`/cov-analysis-linux64/bin:$PATH cov-build --dir cov-int make - name: Submit the result to Coverity Scan run: | tar czvf xdp-tools.tgz cov-int curl \ --form project=xdp-project/xdp-tools \ --form token=$TOKEN \ --form email=toke@redhat.com \ --form file=@xdp-tools.tgz \ --form version=trunk \ --form description="xdp-tools" \ https://scan.coverity.com/builds?project=xdp-project%2Fxdp-tools env: TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }} xdp-tools-1.6.1/.github/workflows/release.yml000066400000000000000000000011701514310632100212030ustar00rootroot00000000000000--- name: "tagged-release" on: push: tags: - "v*" jobs: tagged-release: name: "Tagged Release" runs-on: "ubuntu-latest" permissions: contents: write steps: - name: Check out repository code uses: actions/checkout@v4 with: submodules: recursive - name: "Create source archive" run: | ./mkarchive.sh - uses: "marvinpinto/action-automatic-releases@919008cf3f741b179569b7a6fb4d8860689ab7f0" #v1.2.1 with: repo_token: "${{ secrets.GITHUB_TOKEN }}" prerelease: false files: | *.tar.gz xdp-tools-1.6.1/.github/workflows/selftests.yml000066400000000000000000000043521514310632100216040ustar00rootroot00000000000000name: Selftests on: push: branches: [ main ] pull_request: branches: [ main ] jobs: selftest: permissions: contents: read pull-requests: write runs-on: ubuntu-24.04 strategy: matrix: KERNEL_VERSION: - "6.18.0-65.fc44" - "6.17.11-200.fc42" - "6.16.7-200.fc42" - "6.14.11-200.fc41" - "6.13.7-200.fc41" - "6.12.8-200.fc41" - "6.10.12-200.fc40" - "6.6.14-200.fc39" - "6.1.9-200.fc37" - "5.16.8-200.fc35" - "5.11.0-156.fc34" - "5.6.19-300.fc32" LLVM_VERSION: - 16 - 17 - 18 - 19 - 20 - 21 fail-fast: false env: 
KERNEL_VERSION: ${{ matrix.KERNEL_VERSION }} LLVM_VERSION: ${{ matrix.LLVM_VERSION }} CLANG: clang-${{ matrix.LLVM_VERSION }} LLVM_STRIP: llvm-strip-${{ matrix.LLVM_VERSION }} # can't use unshare on old kernels DID_UNSHARE: ${{ (startsWith(matrix.KERNEL_VERSION, '5.6') || startsWith(matrix.KERNEL_VERSION, '5.11')) && 1 || 0 }} steps: - name: Check out repository code uses: actions/checkout@v4 with: submodules: recursive - name: Prepare packages run: | sudo apt-get update sudo apt-get install zstd binutils-dev elfutils libpcap-dev libelf-dev libbpf-dev linux-tools-common gcc-multilib pkg-config wireshark tshark bpfcc-tools python3 python3-pip python3-setuptools qemu-kvm rpm2cpio libdw-dev libdwarf-dev libcap-ng-dev socat virtme-ng ndisc6 arping - name: Prepare Clang run: | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-$LLVM_VERSION main" | sudo tee -a /etc/apt/sources.list sudo apt-get -qq update sudo apt-get -qq -y install clang-$LLVM_VERSION lld-$LLVM_VERSION llvm-$LLVM_VERSION - name: Compile run: make - name: Prepare test tools run: .github/scripts/prepare_test_tools.sh - name: Prepare test kernel run: .github/scripts/prepare_test_kernel.sh - name: Run tests run: .github/scripts/run_tests_in_vm.sh xdp-tools-1.6.1/.gitignore000066400000000000000000000011051514310632100154310ustar00rootroot00000000000000# Prerequisites *.d # Object files *.o *.ko *.obj *.elf # Linker output *.ilk *.exp *.ll # Precompiled Headers *.gch *.pch # Libraries *.lib *.a *.la *.lo # Shared objects (inc. 
Windows DLLs) *.dll *.so *.so.* *.dylib # Executables *.exe *.out *.app *.i*86 *.x86_64 *.hex # Debug files *.dSYM/ *.su *.idb *.pdb # Kernel Module Compile Results *.mod* *.cmd .tmp_versions/ modules.order Module.symvers Mkfile.old dkms.conf config.mk xdp-dispatcher.c *.man *.rpm /xdp-tools-*.tar.gz .ccls-cache .clangd .cache compile_commands.json # BPF skeleton files *.skel.h .vscode xdp-tools-1.6.1/.gitmodules000066400000000000000000000001501514310632100156150ustar00rootroot00000000000000[submodule "libbpf"] path = lib/libbpf url = https://github.com/libbpf/libbpf.git ignore = untracked xdp-tools-1.6.1/.lgtm.yml000066400000000000000000000002211514310632100152030ustar00rootroot00000000000000extraction: cpp: after_prepare: - export RELAXED_LLVM_VERSION=1 path_classifiers: library: - lib/libbpf/*/* - lib/libbpf/* xdp-tools-1.6.1/LICENSE000066400000000000000000000003761514310632100144570ustar00rootroot00000000000000The code in this repository is licensed by a mix of GPL-2.0, LGPL-2.1 and BSD-2-Clause licenses, as indicated by the SPDX license headers in individual source files. The full text of these licenses is available in the files in the LICENSES subdirectory. xdp-tools-1.6.1/LICENSES/000077500000000000000000000000001514310632100146515ustar00rootroot00000000000000xdp-tools-1.6.1/LICENSES/BSD-2-Clause000066400000000000000000000031511514310632100165550ustar00rootroot00000000000000Valid-License-Identifier: BSD-2-Clause SPDX-URL: https://spdx.org/licenses/BSD-2-Clause.html Usage-Guide: To use the BSD 2-clause "Simplified" License put the following SPDX tag/value pair into a comment according to the placement guidelines in the licensing rules documentation: SPDX-License-Identifier: BSD-2-Clause License-Text: Copyright (c) . All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xdp-tools-1.6.1/LICENSES/GPL-2.0000066400000000000000000000445721514310632100155270ustar00rootroot00000000000000Valid-License-Identifier: GPL-2.0 Valid-License-Identifier: GPL-2.0-only Valid-License-Identifier: GPL-2.0+ Valid-License-Identifier: GPL-2.0-or-later SPDX-URL: https://spdx.org/licenses/GPL-2.0.html Usage-Guide: To use this license in source code, put one of the following SPDX tag/value pairs into a comment according to the placement guidelines in the licensing rules documentation. 
For 'GNU General Public License (GPL) version 2 only' use: SPDX-License-Identifier: GPL-2.0 or SPDX-License-Identifier: GPL-2.0-only For 'GNU General Public License (GPL) version 2 or any later version' use: SPDX-License-Identifier: GPL-2.0+ or SPDX-License-Identifier: GPL-2.0-or-later License-Text: GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. 
You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. xdp-tools-1.6.1/LICENSES/LGPL-2.1000066400000000000000000000654251514310632100156440ustar00rootroot00000000000000Valid-License-Identifier: LGPL-2.1 Valid-License-Identifier: LGPL-2.1+ SPDX-URL: https://spdx.org/licenses/LGPL-2.1.html Usage-Guide: To use this license in source code, put one of the following SPDX tag/value pairs into a comment according to the placement guidelines in the licensing rules documentation. 
For 'GNU Lesser General Public License (LGPL) version 2.1 only' use: SPDX-License-Identifier: LGPL-2.1 For 'GNU Lesser General Public License (LGPL) version 2.1 or any later version' use: SPDX-License-Identifier: LGPL-2.1+ License-Text: GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. 
These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. 
When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. 
Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). 
Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) 
These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. 
You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. 
If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. 
(It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. 
You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. 
You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. 
If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. one line to give the library's name and an idea of what it does. 
Copyright (C) year name of author This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. signature of Ty Coon, 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
xdp-tools-1.6.1/Makefile000066400000000000000000000041251514310632100151060ustar00rootroot00000000000000 # SPDX-License-Identifier: GPL-2.0 # Top level Makefile for xdp-tools ifeq ("$(origin V)", "command line") VERBOSE = $(V) endif ifndef VERBOSE VERBOSE = 0 endif ifeq ($(VERBOSE),0) MAKEFLAGS += --no-print-directory endif include version.mk include config.mk UTILS := xdp-filter xdp-loader xdp-dump ifneq ($(BPFTOOL),) UTILS += xdp-bench xdp-forward xdp-monitor xdp-trafficgen endif SUBDIRS := lib $(UTILS) .PHONY: check_submodule help clobber distclean clean install test libxdp $(SUBDIRS) all: $(SUBDIRS) lib: config.mk check_submodule @echo; echo $@; $(MAKE) -C $@ libxdp: config.mk check_submodule @echo; echo lib; $(MAKE) -C lib $@ libxdp_install: libxdp @$(MAKE) -C lib $@ $(UTILS): lib @echo; echo $@; $(MAKE) -C $@ help: @echo "Make Targets:" @echo " all - build binaries" @echo " clean - remove products of build" @echo " distclean - remove configuration and build" @echo " install - install binaries on local machine" @echo " test - run test suite" @echo " archive - create tarball of all sources" @echo "" @echo "Make Arguments:" @echo " V=[0|1] - set build verbosity level" config.mk: configure sh configure check_submodule: @if [ -d .git ] && `git submodule status lib/libbpf | grep -q '^+'`; then \ echo "" ;\ echo "** WARNING **: git submodule SHA-1 out-of-sync" ;\ echo " consider running: git submodule update" ;\ echo "" ;\ fi\ clobber: touch config.mk $(MAKE) clean rm -f config.mk cscope.* compile_commands.json distclean: clobber clean: check_submodule @for i in $(SUBDIRS); \ do $(MAKE) -C $$i clean; done install: all @for i in $(SUBDIRS); \ do $(MAKE) -C $$i install; done test: all @for i in lib/libxdp $(UTILS); do \ echo; echo test $$i; $(MAKE) -C $$i test; \ if [ $$? -ne 0 ]; then failed="y"; fi; \ done; \ if [ ! 
-z $$failed ]; then exit 1; fi archive: xdp-tools-$(TOOLS_VERSION).tar.gz .PHONY: xdp-tools-$(TOOLS_VERSION).tar.gz xdp-tools-$(TOOLS_VERSION).tar.gz: @./mkarchive.sh "$(TOOLS_VERSION)" compile_commands.json: clean compiledb make V=1 xdp-tools-1.6.1/README.org000066400000000000000000000033671514310632100151230ustar00rootroot00000000000000* xdp-tools - Library and utilities for use with XDP This repository contains the =libxdp= library for working with the eXpress Data Path facility of the Linux kernel, and a collection of utilities and example code that uses the library. The repository contains the following: - [[lib/libxdp/][lib/libxdp/]] - the =libxdp= library itself - can be built standalone using =make libxdp= - [[xdp-bench/][xdp-bench/]] - an XDP benchmarking tool - [[xdp-dump/][xdp-dump/]] - a tcpdump-like tool for capturing packets at the XDP layer - [[xdp-filter/][xdp-filter/]] - a simple packet filtering utility powered by XDP - [[xdp-forward/][xdp-forward/]] - an XDP forwarding plane - [[xdp-loader/][xdp-loader/]] - a command-line utility for loading XDP programs using =libxdp= - [[xdp-monitor/][xdp-monitor/]] - a simple XDP tracepoint monitoring tool - [[xdp-trafficgen/][xdp-trafficgen/]] - an XDP-based packet generator - [[headers/xdp/][headers/xdp/]] - reusable eBPF code snippets for XDP (installed in /usr/include/xdp by =make install=). - [[lib/util/][lib/util/]] - common code shared between the different utilities - [[packaging/][packaging/]] - files used for distro packaging - lib/libbpf/ - a git submodule with [[https://github.com/libbpf/libbpf][libbpf]], used if the system version is not recent enough To compile, first run =./configure=, then simply type =make=. Make sure you either have a sufficiently recent libbpf installed on your system, or that you pulled down the libbpf git submodule (=git submodule init && git submodule update=). 
For a general introduction to XDP, please see the [[https://github.com/xdp-project/xdp-tutorial][XDP tutorial]], and for more BPF and XDP examples, see the [[https://github.com/xdp-project/bpf-examples][bpf-examples repository]]. xdp-tools-1.6.1/configure000077500000000000000000000347211514310632100153620ustar00rootroot00000000000000#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # This is not an autoconf generated configure # # Output file which is input to Makefile CONFIG_FINAL=config.mk CONFIG=".${CONFIG}.tmp" # Make a temp directory in build tree. TMPDIR=$(mktemp -d config.XXXXXX) trap 'status=$?; rm -rf $TMPDIR; rm -f $CONFIG; exit $status' EXIT HUP INT QUIT TERM check_opts() { : ${PRODUCTION:=0} : ${DYNAMIC_LIBXDP:=0} : ${MAX_DISPATCHER_ACTIONS:=10} : ${BPF_TARGET:=bpf} echo "PRODUCTION:=${PRODUCTION}" >>$CONFIG echo "DYNAMIC_LIBXDP:=${DYNAMIC_LIBXDP}" >>$CONFIG echo "MAX_DISPATCHER_ACTIONS:=${MAX_DISPATCHER_ACTIONS}" >>$CONFIG echo "BPF_TARGET:=${BPF_TARGET}" >>$CONFIG } find_tool() { local tool_name local tool_path local v tool_name="$1" tool_path="$2" if [ "$tool_name" != "$tool_path" ] || command -v "$tool_path" >/dev/null 2>&1; then echo $tool_path return 0 fi # we're looking for a binary with the same name as tool_name; try version # suffixes in order until we find one for v in 17 16 15 14 13 12 11; do tool_path="${tool_name}-$v" if command -v "$tool_path" >/dev/null 2>&1; then echo $tool_path return 0 fi done # Fall back to supplied default, check in caller will error out echo $tool_name } check_toolchain() { local emacs_version local clang_version local bpftool_version : ${PKG_CONFIG:=pkg-config} : ${CC=gcc} : ${OBJCOPY=objcopy} : ${CLANG=clang} : ${M4=m4} : ${EMACS=emacs} : ${BPFTOOL=bpftool} : ${READELF=readelf} : ${ARCH_INCLUDES=} : ${ARCH_NAME=} CLANG=$(find_tool clang "$CLANG") for TOOL in $PKG_CONFIG $CC $OBJCOPY $CLANG $M4 $READELF; do if [ ! 
$(command -v ${TOOL} 2>/dev/null) ]; then echo "*** ERROR: Cannot find tool ${TOOL}" ; exit 1; fi; done ARCH_NAME=$($CC -print-multiarch 2>/dev/null) clang_version=$($CLANG --version | sed -nE 's/.*clang version ([[:digit:]]+).*/\1/p') if [ "$?" -ne "0" ]; then echo "*** ERROR: Couldn't execute '$CLANG --version'" exit 1 fi echo "Found clang binary '$CLANG' with version $clang_version (from '$($CLANG --version | head -n 1)')" if [ "$clang_version" -lt "11" ]; then echo "*** ERROR: Need LLVM version 11+, '$CLANG' is version $clang_version" [ -n "$RELAXED_LLVM_VERSION" ] || exit 1 fi if ! command -v $EMACS >/dev/null 2>&1; then EMACS="" else emacs_major=$($EMACS -Q --batch --eval='(message "%s" emacs-major-version)' 2>&1) if [ -n "$emacs_major" ] && [ "$emacs_major" -ge 26 ]; then echo "using emacs: $EMACS, version $emacs_major" else echo "not using emacs: $EMACS, as it is too old (wanted version >=26, got $emacs_major)" EMACS="" fi fi if [ -z "$EMACS" ] && [ "${FORCE_EMACS:-0}" -eq "1" ]; then echo "FORCE_EMACS is set, but no usable emacs found on system" rm -f "$CONFIG" exit 1 fi if command -v $BPFTOOL &>/dev/null && $BPFTOOL gen help 2>&1 | grep 'gen skeleton.*name' > /dev/null; then bpftool_version=$($BPFTOOL version | head -n 1) echo "using $bpftool_version" else echo "bpftool not found or doesn't support skeleton generation; not building all tools" BPFTOOL= fi if [ -z "$ARCH_INCLUDES" ] && [ -n "$ARCH_NAME" ]; then for dir in $(echo | $CC -Wp,-v -E - 2>&1 | grep '^ '); do local idir idir="${dir}/${ARCH_NAME}/" [ -d "$idir" ] && ARCH_INCLUDES="-I${idir} $ARCH_INCLUDES" done fi echo "PKG_CONFIG:=${PKG_CONFIG}" >>$CONFIG echo "CC:=${CC}" >>$CONFIG echo "OBJCOPY:=${OBJCOPY}" >>$CONFIG echo "CLANG:=${CLANG}" >>$CONFIG echo "M4:=${M4}" >>$CONFIG echo "EMACS:=${EMACS}" >>$CONFIG echo "ARCH_INCLUDES:=$ARCH_INCLUDES" >> $CONFIG echo "READELF:=${READELF}" >> $CONFIG echo "BPFTOOL:=${BPFTOOL}" >> $CONFIG [ -n "$BPFTOOL" ] && echo "HAVE_FEATURES+=BPFTOOL" >>"$CONFIG" } 
check_zlib() { if ${PKG_CONFIG} zlib --exists; then echo "HAVE_ZLIB:=y" >>$CONFIG echo "yes" echo 'CFLAGS += -DHAVE_ZLIB' `${PKG_CONFIG} zlib --cflags` >> $CONFIG echo 'LDLIBS += ' `${PKG_CONFIG} zlib --libs` >>$CONFIG else echo "missing - this is required" return 1 fi } check_elf() { if ${PKG_CONFIG} libelf --exists; then echo "HAVE_ELF:=y" >>$CONFIG echo "yes" echo 'CFLAGS += -DHAVE_ELF' `${PKG_CONFIG} libelf --cflags` >> $CONFIG echo 'LDLIBS += ' `${PKG_CONFIG} libelf --libs` >>$CONFIG else echo "missing - this is required" return 1 fi } check_pcap() { local libpcap_err if ${PKG_CONFIG} libpcap --exists; then LIBPCAP_CFLAGS=$(${PKG_CONFIG} libpcap --cflags) LIBPCAP_LDLIBS=$(${PKG_CONFIG} libpcap --libs) else LIBPCAP_CFLAGS="" LIBPCAP_LDLIBS="-lpcap" fi cat >$TMPDIR/libpcaptest.c < #include int main(int argc, char **argv) { pcap_t *pcap = pcap_open_live("ifname", 100, 1, 1000, NULL); return 0; } EOF libpcap_err=$($CC -o $TMPDIR/libpcaptest $TMPDIR/libpcaptest.c $LIBPCAP_CFLAGS $LIBPCAP_LDLIBS $LDFLAGS 2>&1) if [ "$?" -eq "0" ]; then echo "HAVE_PCAP:=y" >>$CONFIG [ -n "$LIBPCAP_CFLAGS" ] && echo 'CFLAGS += ' $LIBPCAP_CFLAGS >> $CONFIG echo "yes" else echo "missing - this is required" echo "error: $libpcap_err" return 1 fi } check_cap_ng() { if ${PKG_CONFIG} libcap-ng --exists; then echo "HAVE_CAP_NG:=y" >>$CONFIG echo "yes" echo 'CAP_NG_CFLAGS:='`${PKG_CONFIG} libcap-ng --cflags` >> $CONFIG echo 'CAP_NG_LDLIBS:='`${PKG_CONFIG} libcap-ng --libs` >>$CONFIG else echo "no" fi } check_libbpf_function() { local FUNCTION_NAME local FUNCTION_ARGS local FUNCTION_DEFS local LIBBPF_CFLAGS local LIBBPF_LDLIBS local config_var FUNCTION_NAME="$1" FUNCTION_ARGS="$2" FUNCTION_DEFS="$3" LIBBPF_CFLAGS="$4" LIBBPF_LDLIBS="$5" config_var="LIBBPF_$(echo $FUNCTION_NAME | tr 'a-z' 'A-Z')" echo -n " $FUNCTION_NAME support: " # If LIBBPF_LDLIBS is empty that means we're using the submodule version of # libbpf. 
We know it does support all the APIs we're testing for, so we hard # code it as supported. We can't actually run the check as the embedded # libbpf.a has not been built at configure time. if [ -z "$LIBBPF_LDLIBS" ]; then echo "HAVE_FEATURES+=${config_var}" >>"$CONFIG" echo "yes (submodule)" return 0; fi # If this is set we don't try to link against libbpf, as it may be in a # different submodule and have not been built yet. Instead, we'll copy over # the header files from the libbpf sources so those are used first, # triggering a compile error if the function we are testing for is missing. if [ -n "$LIBBPF_UNBUILT" ]; then LIBBPF_LDLIBS="-Xlinker --unresolved-symbols=ignore-in-object-files" LIBBPF_CFLAGS="-I$TMPDIR/include" mkdir -p "$TMPDIR/include" cp -r headers/bpf headers/linux headers/xdp "$TMPDIR/include/" cp "$LIBBPF_DIR"/src/bpf.h "$LIBBPF_DIR"/src/btf.h "$LIBBPF_DIR"/src/libbpf*.h "$TMPDIR/include/bpf" [ "$?" -eq 0 ] || return fi cat >$TMPDIR/libbpftest.c < #include #include int main(int argc, char **argv) { ${FUNCTION_DEFS}; ${FUNCTION_NAME}${FUNCTION_ARGS}; return 0; } EOF compile_cmd="$CC -o $TMPDIR/libbpftest $TMPDIR/libbpftest.c -Werror $LIBBPF_CFLAGS $CFLAGS $LIBBPF_LDLIBS $LDFLAGS" libbpf_err=$($compile_cmd 2>&1) if [ "$?" 
-eq "0" ]; then echo "HAVE_FEATURES+=${config_var}" >>"$CONFIG" echo "yes" else echo "no" fi if [ -n "$DEBUG_CONFIGURE" ]; then echo " $compile_cmd" echo "${libbpf_err}" | sed 's/^/ /gm' fi } check_libbpf_functions() { local LIBBPF_CFLAGS local LIBBPF_LDLIBS LIBBPF_CFLAGS="$1" LIBBPF_LDLIBS="$2" check_libbpf_function "perf_buffer__consume" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "btf__load_from_kernel_by_id" "(0)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "btf__type_cnt" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_object__next_map" "(NULL, NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_object__next_program" "(NULL, NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_program__insn_cnt" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_program__type" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_program__flags" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_program__expected_attach_type" "(NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_map_create" "(0, NULL, 0, 0, 0, NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "perf_buffer__new_raw" "(0, 0, NULL, NULL, NULL, NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_xdp_attach" "(0, 0, 0, NULL)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_map__set_autocreate" "(NULL, false)" "" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_prog_test_run_opts" "(0, &opts)" "DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .batch_size = 1)" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" check_libbpf_function "bpf_xdp_query" "(0, 0, &opts)" "DECLARE_LIBBPF_OPTS(bpf_xdp_query_opts, opts, .feature_flags = 1)" "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" } get_libbpf_version() { local libbpf_dir local version libbpf_dir="$1" if [ -f "${libbpf_dir}/libbpf.map" ]; then version=$(grep -oE 
'^LIBBPF_([0-9.]+)' "${libbpf_dir}/libbpf.map" | sort -rV | head -n1 | cut -d'_' -f2) else version=unknown fi echo $version } check_libbpf() { local libbpf_err if [ "${FORCE_SUBDIR_LIBBPF:-0}" -ne "1" ] && ${PKG_CONFIG} libbpf --exists && [ -z "$LIBBPF_DIR" ]; then LIBBPF_CFLAGS=$(${PKG_CONFIG} libbpf --cflags) LIBBPF_LDLIBS=$(${PKG_CONFIG} libbpf --libs) LIBBPF_VERSION=$(${PKG_CONFIG} libbpf --modversion) cat >$TMPDIR/libbpftest.c < #include #include int main(int argc, char **argv) { void *ptr = NULL; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = NULL); DECLARE_LIBBPF_OPTS(bpf_link_create_opts, lopts, .target_btf_id = 0); (void) bpf_object__open_file(NULL, &opts); (void) bpf_program__name(ptr); (void) bpf_map__set_initial_value(ptr, ptr, 0); return 0; } EOF libbpf_err=$($CC -o $TMPDIR/libbpftest $TMPDIR/libbpftest.c -Werror $LIBBPF_CFLAGS $CFLAGS $LIBBPF_LDLIBS $LDFLAGS 2>&1) if [ "$?" -eq "0" ]; then echo "SYSTEM_LIBBPF:=y" >>$CONFIG echo "LIBBPF_VERSION=$LIBBPF_VERSION" >>$CONFIG echo 'CFLAGS += ' $LIBBPF_CFLAGS >> $CONFIG echo 'LDLIBS += ' $LIBBPF_LDLIBS >>$CONFIG echo 'OBJECT_LIBBPF = ' >>$CONFIG echo "system v$LIBBPF_VERSION" check_libbpf_functions "$LIBBPF_CFLAGS" "$LIBBPF_LDLIBS" return 0 fi else libbpf_err="${PKG_CONFIG} couldn't find libbpf" fi if [ "${FORCE_SYSTEM_LIBBPF:-0}" -eq "1" ]; then echo "FORCE_SYSTEM_LIBBPF is set, but no usable libbpf found on system" echo "error: $libbpf_err" rm -f "$CONFIG" exit 1 fi if [ -n "$LIBBPF_DIR" ]; then [ -z "$LIBBPF_INCLUDE_DIR" ] && LIBBPF_INCLUDE_DIR="$(readlink -m ${LIBBPF_DIR}/include)" [ -z "$LIBBPF_LIB_DIR" ] && LIBBPF_LIB_DIR="$(readlink -m ${LIBBPF_DIR}/src)" LIBBPF_VERSION=$(get_libbpf_version "$LIBBPF_DIR/src") OBJECT_LIBBPF= echo "custom v$LIBBPF_VERSION" check_libbpf_functions "-I${LIBBPF_INCLUDE_DIR}" "-L${LIBBPF_LIB_DIR} -l:libbpf.a" else if ! 
[ -d "lib/libbpf/src" ] && [ -f ".gitmodules" ] && [ -e ".git" ]; then git submodule init && git submodule update fi LIBBPF_VERSION=$(get_libbpf_version "lib/libbpf/src") LIBBPF_INCLUDE_DIR='$(LIB_DIR)/libbpf/src/root/include' LIBBPF_LIB_DIR='$(LIB_DIR)/libbpf/src' OBJECT_LIBBPF="${LIBBPF_LIB_DIR}/libbpf.a" echo "submodule v$LIBBPF_VERSION" check_libbpf_functions "" "" fi echo "SYSTEM_LIBBPF:=n" >> $CONFIG echo "LIBBPF_VERSION=$LIBBPF_VERSION" >>$CONFIG echo "CFLAGS += -I${LIBBPF_INCLUDE_DIR}" >>$CONFIG echo "BPF_CFLAGS += -I${LIBBPF_INCLUDE_DIR}" >>$CONFIG echo "LDFLAGS += -L${LIBBPF_LIB_DIR}" >>$CONFIG echo 'LDLIBS += -l:libbpf.a' >>$CONFIG echo "OBJECT_LIBBPF = ${OBJECT_LIBBPF}" >>$CONFIG echo -n "zlib support: " check_zlib || exit 1 echo -n "ELF support: " check_elf || exit 1 echo -n "pcap support: " check_pcap || exit 1 } check_secure_getenv() { cat >$TMPDIR/secure_getenv.c < int main(int argc, char **argv) { secure_getenv("test"); return 0; } EOF secure_getenv_err=$($CC -o $TMPDIR/secure_getenv $TMPDIR/secure_getenv.c 2>&1) if [ "$?" 
-eq "0" ]; then echo "HAVE_FEATURES += SECURE_GETENV" >>"$CONFIG" echo "yes" else echo "no" fi } quiet_config() { cat <$CONFIG quiet_config >> $CONFIG check_opts check_toolchain echo -n "libbpf support: " check_libbpf echo -n "secure_getenv support: " check_secure_getenv echo -n "cap-ng support: " check_cap_ng if [ -n "$KERNEL_HEADERS" ]; then echo "kernel headers: $KERNEL_HEADERS" echo "CFLAGS += -I$KERNEL_HEADERS" >>$CONFIG echo "BPF_CFLAGS += -I$KERNEL_HEADERS" >>$CONFIG fi mv $CONFIG $CONFIG_FINAL xdp-tools-1.6.1/headers/000077500000000000000000000000001514310632100150575ustar00rootroot00000000000000xdp-tools-1.6.1/headers/bpf/000077500000000000000000000000001514310632100156265ustar00rootroot00000000000000xdp-tools-1.6.1/headers/bpf/bpf_trace_helpers.h000066400000000000000000000127011514310632100214470ustar00rootroot00000000000000/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #ifndef __BPF_TRACE_HELPERS_H #define __BPF_TRACE_HELPERS_H #include #define ___bpf_concat(a, b) a ## b #define ___bpf_apply(fn, n) ___bpf_concat(fn, n) #define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N #define ___bpf_narg(...) \ ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) #define ___bpf_empty(...) \ ___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) #define ___bpf_ctx_cast0() ctx #define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] #define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] #define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] #define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] #define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] #define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] #define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] #define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] #define ___bpf_ctx_cast9(x, args...) 
___bpf_ctx_cast8(args), (void *)ctx[8] #define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] #define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] #define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] #define ___bpf_ctx_cast(args...) \ ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and * similar kinds of BPF programs, that accept input arguments as a single * pointer to untyped u64 array, where each u64 can actually be a typed * pointer or integer of different size. Instead of requring user to write * manual casts and work with array elements by index, BPF_PROG macro * allows user to declare a list of named and typed input arguments in the * same syntax as for normal C function. All the casting is hidden and * performed transparently, while user code can just assume working with * function arguments of specified type and name. * * Original raw context argument is preserved as well as 'ctx' argument. * This is useful when using BPF helpers that expect original context * as one of the parameters (e.g., for bpf_perf_event_output()). */ #define BPF_PROG(name, args...) \ name(unsigned long long *ctx); \ static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args); \ typeof(name(0)) name(unsigned long long *ctx) \ { \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ return ____##name(___bpf_ctx_cast(args)); \ _Pragma("GCC diagnostic pop") \ } \ static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) struct pt_regs; #define ___bpf_kprobe_args0() ctx #define ___bpf_kprobe_args1(x) \ ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) #define ___bpf_kprobe_args2(x, args...) \ ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) #define ___bpf_kprobe_args3(x, args...) 
\ ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) #define ___bpf_kprobe_args4(x, args...) \ ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) #define ___bpf_kprobe_args5(x, args...) \ ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) #define ___bpf_kprobe_args(args...) \ ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific * low-level way of getting kprobe input arguments from struct pt_regs, and * provides a familiar typed and named function arguments syntax and * semantics of accessing kprobe input paremeters. * * Original struct pt_regs* context is preserved as 'ctx' argument. This might * be necessary when using BPF helpers like bpf_perf_event_output(). */ #define BPF_KPROBE(name, args...) \ name(struct pt_regs *ctx); \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ typeof(name(0)) name(struct pt_regs *ctx) \ { \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ return ____##name(___bpf_kprobe_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx #define ___bpf_kretprobe_argsN(x, args...) \ ___bpf_kprobe_args(args), (void *)PT_REGS_RET(ctx) #define ___bpf_kretprobe_args(args...) \ ___bpf_apply(___bpf_kretprobe_args, ___bpf_empty(args))(args) /* * BPF_KRETPROBE is similar to BPF_KPROBE, except, in addition to listing all * input kprobe arguments, one last extra argument has to be specified, which * captures kprobe return value. */ #define BPF_KRETPROBE(name, args...) 
\ name(struct pt_regs *ctx); \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ typeof(name(0)) name(struct pt_regs *ctx) \ { \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ return ____##name(___bpf_kretprobe_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) #endif xdp-tools-1.6.1/headers/bpf/vmlinux.h000066400000000000000000000007061514310632100175040ustar00rootroot00000000000000#ifndef __VMLINUX_H__ #define __VMLINUX_H__ #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) #endif struct net_device { int ifindex; }; struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; struct bpf_prog { }; struct bpf_map { }; #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute pop #endif #endif /* __VMLINUX_H__ */ xdp-tools-1.6.1/headers/linux/000077500000000000000000000000001514310632100162165ustar00rootroot00000000000000xdp-tools-1.6.1/headers/linux/bpf.h000066400000000000000000010100251514310632100171350ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ #ifndef _UAPI__LINUX_BPF_H__ #define _UAPI__LINUX_BPF_H__ #include #include /* Extended instruction set based on top of classic BPF */ /* instruction classes */ #define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ #define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ #define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ /* change endianness of a register */ #define BPF_END 0xd0 /* flags for endianness conversion: */ #define BPF_TO_LE 0x00 /* convert to little-endian */ #define BPF_TO_BE 0x08 /* convert to big-endian */ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE /* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ #define BPF_JLT 0xa0 /* LT is unsigned, '<' */ #define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ #define BPF_JSLT 0xc0 /* SLT is signed, '<' */ #define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ #define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ #define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ #define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { BPF_REG_0 = 0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9, BPF_REG_10, __MAX_BPF_REG, }; /* BPF has 10 general purpose 64-bit registers and stack frame. 
*/ #define MAX_BPF_REG __MAX_BPF_REG struct bpf_insn { __u8 code; /* opcode */ __u8 dst_reg:4; /* dest register */ __u8 src_reg:4; /* source register */ __s16 off; /* signed offset */ __s32 imm; /* signed immediate constant */ }; /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_ORDER_UNSPEC = 0, BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ }; union bpf_iter_link_info { struct { __u32 map_fd; } map; struct { enum bpf_cgroup_iter_order order; /* At most one of cgroup_fd and cgroup_id can be non-zero. If * both are zero, the walk starts from the default cgroup v2 * root. For walking v1 hierarchy, one should always explicitly * specify cgroup_fd. */ __u32 cgroup_fd; __u64 cgroup_id; } cgroup; /* Parameters of task iterators. */ struct { __u32 tid; __u32 pid; __u32 pid_fd; } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ /** * DOC: eBPF Syscall Preamble * * The operation to be performed by the **bpf**\ () system call is determined * by the *cmd* argument. Each operation takes an accompanying argument, * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see * below). The size argument is the size of the union pointed to by *attr*. */ /** * DOC: eBPF Syscall Commands * * BPF_MAP_CREATE * Description * Create a map and return a file descriptor that refers to the * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) * is automatically enabled for the new file descriptor. 
* * Applying **close**\ (2) to the file descriptor returned by * **BPF_MAP_CREATE** will delete the map (but see NOTES). * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_MAP_LOOKUP_ELEM * Description * Look up an element with a given *key* in the map referred to * by the file descriptor *map_fd*. * * The *flags* argument may be specified as one of the * following: * * **BPF_F_LOCK** * Look up the value of a spin-locked map without * returning the lock. This must be specified if the * elements contain a spinlock. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_MAP_UPDATE_ELEM * Description * Create or update an element (key/value pair) in a specified map. * * The *flags* argument should be specified as one of the * following: * * **BPF_ANY** * Create a new element or update an existing element. * **BPF_NOEXIST** * Create a new element only if it did not exist. * **BPF_EXIST** * Update an existing element. * **BPF_F_LOCK** * Update a spin_lock-ed map element. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, * **E2BIG**, **EEXIST**, or **ENOENT**. * * **E2BIG** * The number of elements in the map reached the * *max_entries* limit specified at map creation time. * **EEXIST** * If *flags* specifies **BPF_NOEXIST** and the element * with *key* already exists in the map. * **ENOENT** * If *flags* specifies **BPF_EXIST** and the element with * *key* does not exist in the map. * * BPF_MAP_DELETE_ELEM * Description * Look up and delete an element by key in a specified map. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_MAP_GET_NEXT_KEY * Description * Look up an element by key in a specified map and return the key * of the next element. 
Can be used to iterate over all elements
 *		in the map.
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 *		The following cases can be used to iterate over all elements of
 *		the map:
 *
 *		* If *key* is not found, the operation returns zero and sets
 *		  the *next_key* pointer to the key of the first element.
 *		* If *key* is found, the operation returns zero and sets the
 *		  *next_key* pointer to the key of the next element.
 *		* If *key* is the last element, returns -1 and *errno* is set
 *		  to **ENOENT**.
 *
 *		May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
 *		**EINVAL** on error.
 *
 * BPF_PROG_LOAD
 *	Description
 *		Verify and load an eBPF program, returning a new file
 *		descriptor associated with the program.
 *
 *		Applying **close**\ (2) to the file descriptor returned by
 *		**BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
 *
 *		The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
 *		automatically enabled for the new file descriptor.
 *
 *	Return
 *		A new file descriptor (a nonnegative integer), or -1 if an
 *		error occurred (in which case, *errno* is set appropriately).
 *
 * BPF_OBJ_PIN
 *	Description
 *		Pin an eBPF program or map referred to by the specified *bpf_fd*
 *		to the provided *pathname* on the filesystem.
 *
 *		The *pathname* argument must not contain a dot (".").
 *
 *		On success, *pathname* retains a reference to the eBPF object,
 *		preventing deallocation of the object when the original
 *		*bpf_fd* is closed. This allows the eBPF object to live beyond
 *		**close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
 *		process.
 *
 *		Applying **unlink**\ (2) or similar calls to the *pathname*
 *		unpins the object from the filesystem, removing the reference.
 *		If no other file descriptors or filesystem nodes refer to the
 *		same object, it will be deallocated (see NOTES).
 *
 *		The filesystem type for the parent directory of *pathname* must
 *		be **BPF_FS_MAGIC**.
 *
 *	Return
 *		Returns zero on success.
On error, -1 is returned and *errno* * is set appropriately. * * BPF_OBJ_GET * Description * Open a file descriptor for the eBPF object pinned to the * specified *pathname*. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_PROG_ATTACH * Description * Attach an eBPF program to a *target_fd* at the specified * *attach_type* hook. * * The *attach_type* specifies the eBPF attachment point to * attach the program to, and must be one of *bpf_attach_type* * (see below). * * The *attach_bpf_fd* must be a valid file descriptor for a * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap * or sock_ops type corresponding to the specified *attach_type*. * * The *target_fd* must be a valid file descriptor for a kernel * object which depends on the attach type of *attach_bpf_fd*: * * **BPF_PROG_TYPE_CGROUP_DEVICE**, * **BPF_PROG_TYPE_CGROUP_SKB**, * **BPF_PROG_TYPE_CGROUP_SOCK**, * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, * **BPF_PROG_TYPE_CGROUP_SYSCTL**, * **BPF_PROG_TYPE_SOCK_OPS** * * Control Group v2 hierarchy with the eBPF controller * enabled. Requires the kernel to be compiled with * **CONFIG_CGROUP_BPF**. * * **BPF_PROG_TYPE_FLOW_DISSECTOR** * * Network namespace (eg /proc/self/ns/net). * * **BPF_PROG_TYPE_LIRC_MODE2** * * LIRC device path (eg /dev/lircN). Requires the kernel * to be compiled with **CONFIG_BPF_LIRC_MODE2**. * * **BPF_PROG_TYPE_SK_SKB**, * **BPF_PROG_TYPE_SK_MSG** * * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_PROG_DETACH * Description * Detach the eBPF program associated with the *target_fd* at the * hook specified by *attach_type*. The program must have been * previously attached using **BPF_PROG_ATTACH**. * * Return * Returns zero on success. 
On error, -1 is returned and *errno* * is set appropriately. * * BPF_PROG_TEST_RUN * Description * Run the eBPF program associated with the *prog_fd* a *repeat* * number of times against a provided program context *ctx_in* and * data *data_in*, and return the modified program context * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * * The sizes of the buffers provided as input and output * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must * be provided in the corresponding variables *ctx_size_in*, * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any * of these parameters are not provided (ie set to NULL), the * corresponding size field must be zero. * * Some program types have particular requirements: * * **BPF_PROG_TYPE_SK_LOOKUP** * *data_in* and *data_out* must be NULL. * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * * *ctx_out*, *data_in* and *data_out* must be NULL. * *repeat* must be zero. * * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * **ENOSPC** * Either *data_size_out* or *ctx_size_out* is too small. * **ENOTSUPP** * This command is not supported by the program type of * the program referred to by *prog_fd*. * * BPF_PROG_GET_NEXT_ID * Description * Fetch the next eBPF program currently loaded into the kernel. * * Looks for the eBPF program with an id greater than *start_id* * and updates *next_id* on success. If no other eBPF programs * remain with ids higher than *start_id*, returns -1 and sets * *errno* to **ENOENT**. * * Return * Returns zero on success. On error, or when no id remains, -1 * is returned and *errno* is set appropriately. * * BPF_MAP_GET_NEXT_ID * Description * Fetch the next eBPF map currently loaded into the kernel. 
* * Looks for the eBPF map with an id greater than *start_id* * and updates *next_id* on success. If no other eBPF maps * remain with ids higher than *start_id*, returns -1 and sets * *errno* to **ENOENT**. * * Return * Returns zero on success. On error, or when no id remains, -1 * is returned and *errno* is set appropriately. * * BPF_PROG_GET_FD_BY_ID * Description * Open a file descriptor for the eBPF program corresponding to * *prog_id*. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_MAP_GET_FD_BY_ID * Description * Open a file descriptor for the eBPF map corresponding to * *map_id*. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_OBJ_GET_INFO_BY_FD * Description * Obtain information about the eBPF object corresponding to * *bpf_fd*. * * Populates up to *info_len* bytes of *info*, which will be in * one of the following formats depending on the eBPF object type * of *bpf_fd*: * * * **struct bpf_prog_info** * * **struct bpf_map_info** * * **struct bpf_btf_info** * * **struct bpf_link_info** * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_PROG_QUERY * Description * Obtain information about eBPF programs associated with the * specified *attach_type* hook. * * The *target_fd* must be a valid file descriptor for a kernel * object which depends on the attach type of *attach_bpf_fd*: * * **BPF_PROG_TYPE_CGROUP_DEVICE**, * **BPF_PROG_TYPE_CGROUP_SKB**, * **BPF_PROG_TYPE_CGROUP_SOCK**, * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, * **BPF_PROG_TYPE_CGROUP_SYSCTL**, * **BPF_PROG_TYPE_SOCK_OPS** * * Control Group v2 hierarchy with the eBPF controller * enabled. Requires the kernel to be compiled with * **CONFIG_CGROUP_BPF**. 
*
 *		**BPF_PROG_TYPE_FLOW_DISSECTOR**
 *
 *			Network namespace (eg /proc/self/ns/net).
 *
 *		**BPF_PROG_TYPE_LIRC_MODE2**
 *
 *			LIRC device path (eg /dev/lircN). Requires the kernel
 *			to be compiled with **CONFIG_BPF_LIRC_MODE2**.
 *
 *		**BPF_PROG_QUERY** always fetches the number of programs
 *		attached and the *attach_flags* which were used to attach those
 *		programs. Additionally, if *prog_ids* is nonzero and the number
 *		of attached programs is less than *prog_cnt*, populates
 *		*prog_ids* with the eBPF program ids of the programs attached
 *		at *target_fd*.
 *
 *		The following flags may alter the result:
 *
 *		**BPF_F_QUERY_EFFECTIVE**
 *			Only return information regarding programs which are
 *			currently effective at the specified *target_fd*.
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 * BPF_RAW_TRACEPOINT_OPEN
 *	Description
 *		Attach an eBPF program to a tracepoint *name* to access kernel
 *		internal arguments of the tracepoint in their raw form.
 *
 *		The *prog_fd* must be a valid file descriptor associated with
 *		a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
 *
 *		No ABI guarantees are made about the content of tracepoint
 *		arguments exposed to the corresponding eBPF program.
 *
 *		Applying **close**\ (2) to the file descriptor returned by
 *		**BPF_RAW_TRACEPOINT_OPEN** will detach the eBPF program from
 *		the tracepoint (but see NOTES).
 *
 *	Return
 *		A new file descriptor (a nonnegative integer), or -1 if an
 *		error occurred (in which case, *errno* is set appropriately).
 *
 * BPF_BTF_LOAD
 *	Description
 *		Verify and load BPF Type Format (BTF) metadata into the kernel,
 *		returning a new file descriptor associated with the metadata.
 *		BTF is described in more detail at
 *		https://www.kernel.org/doc/html/latest/bpf/btf.html.
 *
 *		The *btf* parameter must point to valid memory providing
 *		*btf_size* bytes of BTF binary metadata.
* * The returned file descriptor can be passed to other **bpf**\ () * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to * associate the BTF with those objects. * * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional * parameters to specify a *btf_log_buf*, *btf_log_size* and * *btf_log_level* which allow the kernel to return freeform log * output regarding the BTF verification process. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_BTF_GET_FD_BY_ID * Description * Open a file descriptor for the BPF Type Format (BTF) * corresponding to *btf_id*. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_TASK_FD_QUERY * Description * Obtain information about eBPF programs associated with the * target process identified by *pid* and *fd*. * * If the *pid* and *fd* are associated with a tracepoint, kprobe * or uprobe perf event, then the *prog_id* and *fd_type* will * be populated with the eBPF program id and file descriptor type * of type **bpf_task_fd_type**. If associated with a kprobe or * uprobe, the *probe_offset* and *probe_addr* will also be * populated. Optionally, if *buf* is provided, then up to * *buf_len* bytes of *buf* will be populated with the name of * the tracepoint, kprobe or uprobe. * * The resulting *prog_id* may be introspected in deeper detail * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_MAP_LOOKUP_AND_DELETE_ELEM * Description * Look up an element with the given *key* in the map referred to * by the file descriptor *fd*, and if found, delete the element. 
* * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map * types, the *flags* argument needs to be set to 0, but for other * map types, it may be specified as: * * **BPF_F_LOCK** * Look up and delete the value of a spin-locked map * without returning the lock. This must be specified if * the elements contain a spinlock. * * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types * implement this command as a "pop" operation, deleting the top * element rather than one corresponding to *key*. * The *key* and *key_len* parameters should be zeroed when * issuing this operation for these map types. * * This command is only valid for the following map types: * * **BPF_MAP_TYPE_QUEUE** * * **BPF_MAP_TYPE_STACK** * * **BPF_MAP_TYPE_HASH** * * **BPF_MAP_TYPE_PERCPU_HASH** * * **BPF_MAP_TYPE_LRU_HASH** * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_MAP_FREEZE * Description * Freeze the permissions of the specified map. * * Write permissions may be frozen by passing zero *flags*. * Upon success, no future syscall invocations may alter the * map state of *map_fd*. Write operations from eBPF programs * are still possible for a frozen map. * * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_BTF_GET_NEXT_ID * Description * Fetch the next BPF Type Format (BTF) object currently loaded * into the kernel. * * Looks for the BTF object with an id greater than *start_id* * and updates *next_id* on success. If no other BTF objects * remain with ids higher than *start_id*, returns -1 and sets * *errno* to **ENOENT**. * * Return * Returns zero on success. On error, or when no id remains, -1 * is returned and *errno* is set appropriately. * * BPF_MAP_LOOKUP_BATCH * Description * Iterate and fetch multiple elements in a map. 
* * Two opaque values are used to manage batch operations, * *in_batch* and *out_batch*. Initially, *in_batch* must be set * to NULL to begin the batched operation. After each subsequent * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant * *out_batch* as the *in_batch* for the next operation to * continue iteration from the current point. * * The *keys* and *values* are output parameters which must point * to memory large enough to hold *count* items based on the key * and value size of the map *map_fd*. The *keys* buffer must be * of *key_size* * *count*. The *values* buffer must be of * *value_size* * *count*. * * The *elem_flags* argument may be specified as one of the * following: * * **BPF_F_LOCK** * Look up the value of a spin-locked map without * returning the lock. This must be specified if the * elements contain a spinlock. * * On success, *count* elements from the map are copied into the * user buffer, with the keys copied into *keys* and the values * copied into the corresponding indices in *values*. * * If an error is returned and *errno* is not **EFAULT**, *count* * is set to the number of successfully processed elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * May set *errno* to **ENOSPC** to indicate that *keys* or * *values* is too small to dump an entire bucket during * iteration of a hash-based map type. * * BPF_MAP_LOOKUP_AND_DELETE_BATCH * Description * Iterate and delete all elements in a map. * * This operation has the same behavior as * **BPF_MAP_LOOKUP_BATCH** with two exceptions: * * * Every element that is successfully returned is also deleted * from the map. This is at least *count* elements. Note that * *count* is both an input and an output parameter. * * Upon returning with *errno* set to **EFAULT**, up to * *count* elements may be deleted without returning the keys * and values of the deleted elements. * * Return * Returns zero on success. 
On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 * BPF_MAP_UPDATE_BATCH
 *	Description
 *		Update multiple elements in a map by *key*.
 *
 *		The *keys* and *values* are input parameters which must point
 *		to memory large enough to hold *count* items based on the key
 *		and value size of the map *map_fd*. The *keys* buffer must be
 *		of *key_size* * *count*. The *values* buffer must be of
 *		*value_size* * *count*.
 *
 *		Each element specified in *keys* is sequentially updated to the
 *		value in the corresponding index in *values*. The *in_batch*
 *		and *out_batch* parameters are ignored and should be zeroed.
 *
 *		The *elem_flags* argument should be specified as one of the
 *		following:
 *
 *		**BPF_ANY**
 *			Create new elements or update existing elements.
 *		**BPF_NOEXIST**
 *			Create new elements only if they do not exist.
 *		**BPF_EXIST**
 *			Update existing elements.
 *		**BPF_F_LOCK**
 *			Update spin_lock-ed map elements. This must be
 *			specified if the map value contains a spinlock.
 *
 *		On success, *count* elements from the map are updated.
 *
 *		If an error is returned and *errno* is not **EFAULT**, *count*
 *		is set to the number of successfully processed elements.
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 *		May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
 *		**E2BIG**. **E2BIG** indicates that the number of elements in
 *		the map reached the *max_entries* limit specified at map
 *		creation time.
 *
 *		May set *errno* to one of the following error codes under
 *		specific circumstances:
 *
 *		**EEXIST**
 *			If *flags* specifies **BPF_NOEXIST** and the element
 *			with *key* already exists in the map.
 *		**ENOENT**
 *			If *flags* specifies **BPF_EXIST** and the element with
 *			*key* does not exist in the map.
 *
 * BPF_MAP_DELETE_BATCH
 *	Description
 *		Delete multiple elements in a map by *key*.
*
 *		The *keys* parameter is an input parameter which must point
 *		to memory large enough to hold *count* items based on the key
 *		size of the map *map_fd*, that is, *key_size* * *count*.
 *
 *		Each element specified in *keys* is sequentially deleted. The
 *		*in_batch*, *out_batch*, and *values* parameters are ignored
 *		and should be zeroed.
 *
 *		The *elem_flags* argument may be specified as one of the
 *		following:
 *
 *		**BPF_F_LOCK**
 *			Look up the value of a spin-locked map without
 *			returning the lock. This must be specified if the
 *			elements contain a spinlock.
 *
 *		On success, *count* elements from the map are deleted.
 *
 *		If an error is returned and *errno* is not **EFAULT**, *count*
 *		is set to the number of successfully processed elements. If
 *		*errno* is **EFAULT**, up to *count* elements may have been
 *		deleted.
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 * BPF_LINK_CREATE
 *	Description
 *		Attach an eBPF program to a *target_fd* at the specified
 *		*attach_type* hook and return a file descriptor handle for
 *		managing the link.
 *
 *	Return
 *		A new file descriptor (a nonnegative integer), or -1 if an
 *		error occurred (in which case, *errno* is set appropriately).
 *
 * BPF_LINK_UPDATE
 *	Description
 *		Update the eBPF program in the specified *link_fd* to
 *		*new_prog_fd*.
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
 *		is set appropriately.
 *
 * BPF_LINK_GET_FD_BY_ID
 *	Description
 *		Open a file descriptor for the eBPF Link corresponding to
 *		*link_id*.
 *
 *	Return
 *		A new file descriptor (a nonnegative integer), or -1 if an
 *		error occurred (in which case, *errno* is set appropriately).
 *
 * BPF_LINK_GET_NEXT_ID
 *	Description
 *		Fetch the next eBPF link currently loaded into the kernel.
 *
 *		Looks for the eBPF link with an id greater than *start_id*
 *		and updates *next_id* on success. If no other eBPF links
 *		remain with ids higher than *start_id*, returns -1 and sets
 *		*errno* to **ENOENT**.
* * Return * Returns zero on success. On error, or when no id remains, -1 * is returned and *errno* is set appropriately. * * BPF_ENABLE_STATS * Description * Enable eBPF runtime statistics gathering. * * Runtime statistics gathering for the eBPF runtime is disabled * by default to minimize the corresponding performance overhead. * This command enables statistics globally. * * Multiple programs may independently enable statistics. * After gathering the desired statistics, eBPF runtime statistics * may be disabled again by calling **close**\ (2) for the file * descriptor returned by this function. Statistics will only be * disabled system-wide when all outstanding file descriptors * returned by prior calls for this subcommand are closed. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_ITER_CREATE * Description * Create an iterator on top of the specified *link_fd* (as * previously created using **BPF_LINK_CREATE**) and return a * file descriptor that can be used to trigger the iteration. * * If the resulting file descriptor is pinned to the filesystem * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls * for that path will trigger the iterator to read kernel state * using the eBPF program attached to *link_fd*. * * Return * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * * BPF_LINK_DETACH * Description * Forcefully detach the specified *link_fd* from its * corresponding attachment point. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * BPF_PROG_BIND_MAP * Description * Bind a map to the lifetime of an eBPF program. * * The map identified by *map_fd* is bound to the program * identified by *prog_fd* and only released when *prog_fd* is * released. 
This may be used in cases where metadata should be * associated with a program which otherwise does not contain any * references to the map (for example, embedded in the eBPF * program instructions). * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * * NOTES * eBPF objects (maps and programs) can be shared between processes. * * * After **fork**\ (2), the child inherits file descriptors * referring to the same eBPF objects. * * File descriptors referring to eBPF objects can be transferred over * **unix**\ (7) domain sockets. * * File descriptors referring to eBPF objects can be duplicated in the * usual way, using **dup**\ (2) and similar calls. * * File descriptors referring to eBPF objects can be pinned to the * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). * * An eBPF object is deallocated only after all file descriptors referring * to the object have been closed and no references remain pinned to the * filesystem or attached (for example, bound to a program or device). 
 */
/* NOTE(review): the enumerators below are kernel UAPI — presumably new
 * entries may only be appended and existing values/order must not change;
 * verify against the upstream linux/bpf.h before editing.
 */
enum bpf_cmd {
	BPF_MAP_CREATE,
	BPF_MAP_LOOKUP_ELEM,
	BPF_MAP_UPDATE_ELEM,
	BPF_MAP_DELETE_ELEM,
	BPF_MAP_GET_NEXT_KEY,
	BPF_PROG_LOAD,
	BPF_OBJ_PIN,
	BPF_OBJ_GET,
	BPF_PROG_ATTACH,
	BPF_PROG_DETACH,
	BPF_PROG_TEST_RUN,
	BPF_PROG_RUN = BPF_PROG_TEST_RUN,	/* alias, see DOC above */
	BPF_PROG_GET_NEXT_ID,
	BPF_MAP_GET_NEXT_ID,
	BPF_PROG_GET_FD_BY_ID,
	BPF_MAP_GET_FD_BY_ID,
	BPF_OBJ_GET_INFO_BY_FD,
	BPF_PROG_QUERY,
	BPF_RAW_TRACEPOINT_OPEN,
	BPF_BTF_LOAD,
	BPF_BTF_GET_FD_BY_ID,
	BPF_TASK_FD_QUERY,
	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
	BPF_MAP_FREEZE,
	BPF_BTF_GET_NEXT_ID,
	BPF_MAP_LOOKUP_BATCH,
	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
	BPF_MAP_UPDATE_BATCH,
	BPF_MAP_DELETE_BATCH,
	BPF_LINK_CREATE,
	BPF_LINK_UPDATE,
	BPF_LINK_GET_FD_BY_ID,
	BPF_LINK_GET_NEXT_ID,
	BPF_ENABLE_STATS,
	BPF_ITER_CREATE,
	BPF_LINK_DETACH,
	BPF_PROG_BIND_MAP,
};

enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
	BPF_MAP_TYPE_HASH,
	BPF_MAP_TYPE_ARRAY,
	BPF_MAP_TYPE_PROG_ARRAY,
	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	BPF_MAP_TYPE_PERCPU_HASH,
	BPF_MAP_TYPE_PERCPU_ARRAY,
	BPF_MAP_TYPE_STACK_TRACE,
	BPF_MAP_TYPE_CGROUP_ARRAY,
	BPF_MAP_TYPE_LRU_HASH,
	BPF_MAP_TYPE_LRU_PERCPU_HASH,
	BPF_MAP_TYPE_LPM_TRIE,
	BPF_MAP_TYPE_ARRAY_OF_MAPS,
	BPF_MAP_TYPE_HASH_OF_MAPS,
	BPF_MAP_TYPE_DEVMAP,
	BPF_MAP_TYPE_SOCKMAP,
	BPF_MAP_TYPE_CPUMAP,
	BPF_MAP_TYPE_XSKMAP,
	BPF_MAP_TYPE_SOCKHASH,
	BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
	/* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching
	 * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to
	 * both cgroup-attached and other progs and supports all functionality
	 * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark
	 * BPF_MAP_TYPE_CGROUP_STORAGE deprecated.
	 */
	BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
	BPF_MAP_TYPE_QUEUE,
	BPF_MAP_TYPE_STACK,
	BPF_MAP_TYPE_SK_STORAGE,
	BPF_MAP_TYPE_DEVMAP_HASH,
	BPF_MAP_TYPE_STRUCT_OPS,
	BPF_MAP_TYPE_RINGBUF,
	BPF_MAP_TYPE_INODE_STORAGE,
	BPF_MAP_TYPE_TASK_STORAGE,
	BPF_MAP_TYPE_BLOOM_FILTER,
	BPF_MAP_TYPE_USER_RINGBUF,
	BPF_MAP_TYPE_CGRP_STORAGE,
};

/* Note that tracing related programs such as
 * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
 * are not subject to a stable API since kernel internal data
 * structures can change from release to release and may
 * therefore break existing tracing BPF programs. Tracing BPF
 * programs correspond to /a/ specific kernel which is to be
 * analyzed, and not /a/ specific kernel /and/ all future ones.
 */
enum bpf_prog_type {
	BPF_PROG_TYPE_UNSPEC,
	BPF_PROG_TYPE_SOCKET_FILTER,
	BPF_PROG_TYPE_KPROBE,
	BPF_PROG_TYPE_SCHED_CLS,
	BPF_PROG_TYPE_SCHED_ACT,
	BPF_PROG_TYPE_TRACEPOINT,
	BPF_PROG_TYPE_XDP,
	BPF_PROG_TYPE_PERF_EVENT,
	BPF_PROG_TYPE_CGROUP_SKB,
	BPF_PROG_TYPE_CGROUP_SOCK,
	BPF_PROG_TYPE_LWT_IN,
	BPF_PROG_TYPE_LWT_OUT,
	BPF_PROG_TYPE_LWT_XMIT,
	BPF_PROG_TYPE_SOCK_OPS,
	BPF_PROG_TYPE_SK_SKB,
	BPF_PROG_TYPE_CGROUP_DEVICE,
	BPF_PROG_TYPE_SK_MSG,
	BPF_PROG_TYPE_RAW_TRACEPOINT,
	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
	BPF_PROG_TYPE_LWT_SEG6LOCAL,
	BPF_PROG_TYPE_LIRC_MODE2,
	BPF_PROG_TYPE_SK_REUSEPORT,
	BPF_PROG_TYPE_FLOW_DISSECTOR,
	BPF_PROG_TYPE_CGROUP_SYSCTL,
	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
	BPF_PROG_TYPE_CGROUP_SOCKOPT,
	BPF_PROG_TYPE_TRACING,
	BPF_PROG_TYPE_STRUCT_OPS,
	BPF_PROG_TYPE_EXT,
	BPF_PROG_TYPE_LSM,
	BPF_PROG_TYPE_SK_LOOKUP,
	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
};

enum bpf_attach_type {
	BPF_CGROUP_INET_INGRESS,
	BPF_CGROUP_INET_EGRESS,
	BPF_CGROUP_INET_SOCK_CREATE,
	BPF_CGROUP_SOCK_OPS,
	BPF_SK_SKB_STREAM_PARSER,
	BPF_SK_SKB_STREAM_VERDICT,
	BPF_CGROUP_DEVICE,
	BPF_SK_MSG_VERDICT,
	BPF_CGROUP_INET4_BIND,
	BPF_CGROUP_INET6_BIND,
	BPF_CGROUP_INET4_CONNECT,
	BPF_CGROUP_INET6_CONNECT,
	BPF_CGROUP_INET4_POST_BIND,
	BPF_CGROUP_INET6_POST_BIND,
	BPF_CGROUP_UDP4_SENDMSG,
	BPF_CGROUP_UDP6_SENDMSG,
	BPF_LIRC_MODE2,
	BPF_FLOW_DISSECTOR,
	BPF_CGROUP_SYSCTL,
	BPF_CGROUP_UDP4_RECVMSG,
	BPF_CGROUP_UDP6_RECVMSG,
	BPF_CGROUP_GETSOCKOPT,
	BPF_CGROUP_SETSOCKOPT,
	BPF_TRACE_RAW_TP,
	BPF_TRACE_FENTRY,
	BPF_TRACE_FEXIT,
	BPF_MODIFY_RETURN,
	BPF_LSM_MAC,
	BPF_TRACE_ITER,
	BPF_CGROUP_INET4_GETPEERNAME,
	BPF_CGROUP_INET6_GETPEERNAME,
	BPF_CGROUP_INET4_GETSOCKNAME,
	BPF_CGROUP_INET6_GETSOCKNAME,
	BPF_XDP_DEVMAP,
	BPF_CGROUP_INET_SOCK_RELEASE,
	BPF_XDP_CPUMAP,
	BPF_SK_LOOKUP,
	BPF_XDP,
	BPF_SK_SKB_VERDICT,
	BPF_SK_REUSEPORT_SELECT,
	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
	BPF_PERF_EVENT,
	BPF_TRACE_KPROBE_MULTI,
	BPF_LSM_CGROUP,
	__MAX_BPF_ATTACH_TYPE
};

#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE

enum bpf_link_type {
	BPF_LINK_TYPE_UNSPEC = 0,
	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
	BPF_LINK_TYPE_TRACING = 2,
	BPF_LINK_TYPE_CGROUP = 3,
	BPF_LINK_TYPE_ITER = 4,
	BPF_LINK_TYPE_NETNS = 5,
	BPF_LINK_TYPE_XDP = 6,
	BPF_LINK_TYPE_PERF_EVENT = 7,
	BPF_LINK_TYPE_KPROBE_MULTI = 8,
	BPF_LINK_TYPE_STRUCT_OPS = 9,
	MAX_BPF_LINK_TYPE,
};

/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
 *
 * NONE(default): No further bpf programs allowed in the subtree.
 *
 * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
 * the program in this cgroup yields to sub-cgroup program.
 *
 * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
 * that cgroup program gets run in addition to the program in this cgroup.
 *
 * Only one program is allowed to be attached to a cgroup with
 * NONE or BPF_F_ALLOW_OVERRIDE flag.
 * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
 * release old program and attach the new one. Attach flags have to match.
 *
 * Multiple programs are allowed to be attached to a cgroup with
 * BPF_F_ALLOW_MULTI flag.
They are executed in FIFO order
 * (those that were attached first, run first)
 * The programs of sub-cgroup are executed first, then programs of
 * this cgroup and then programs of parent cgroup.
 * When a child program makes a decision (like picking TCP CA or sock bind)
 * parent program has a chance to override it.
 *
 * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of
 * programs for a cgroup. Though it's possible to replace an old program at
 * any position by also specifying BPF_F_REPLACE flag and position itself in
 * replace_bpf_fd attribute. Old program at this position will be released.
 *
 * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
 * A cgroup with NONE doesn't allow any programs in sub-cgroups.
 * Ex1:
 * cgrp1 (MULTI progs A, B) ->
 *    cgrp2 (OVERRIDE prog C) ->
 *      cgrp3 (MULTI prog D) ->
 *        cgrp4 (OVERRIDE prog E) ->
 *          cgrp5 (NONE prog F)
 * the event in cgrp5 triggers execution of F,D,A,B in that order.
 * if prog F is detached, the execution is E,D,A,B
 * if prog F and D are detached, the execution is E,A,B
 * if prog F, E and D are detached, the execution is C,A,B
 *
 * All eligible programs are executed regardless of return code from
 * earlier programs.
 */
#define BPF_F_ALLOW_OVERRIDE	(1U << 0)
#define BPF_F_ALLOW_MULTI	(1U << 1)
#define BPF_F_REPLACE		(1U << 2)

/* The BPF_PROG_LOAD flags below reuse the bit positions of the
 * BPF_PROG_ATTACH flags above; the flag words are interpreted per-command.
 */

/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
 * verifier will perform strict alignment checking as if the kernel
 * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
 * and NET_IP_ALIGN defined to 2.
 */
#define BPF_F_STRICT_ALIGNMENT	(1U << 0)

/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
 * verifier will allow any alignment whatsoever. On platforms
 * with strict alignment requirements for loads and stores (such
 * as sparc and mips) the verifier validates that all loads and
 * stores provably follow this requirement. This flag turns that
 * checking and enforcement off.
 *
 * It is mostly used for testing when we want to validate the
 * context and memory access aspects of the verifier, but because
 * of an unaligned access the alignment check would trigger before
 * the one we are interested in.
 */
#define BPF_F_ANY_ALIGNMENT	(1U << 1)

/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose.
 * Verifier does sub-register def/use analysis and identifies instructions whose
 * def only matters for low 32-bit, high 32-bit is never referenced later
 * through implicit zero extension. Therefore verifier notifies JIT back-ends
 * that it is safe to ignore clearing high 32-bit for these instructions. This
 * saves some back-ends a lot of code-gen. However such optimization is not
 * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends
 * hence haven't used verifier's analysis result. But, we really want to have a
 * way to be able to verify the correctness of the described optimization on
 * x86_64 on which testsuites are frequently exercised.
 *
 * So, this flag is introduced. Once it is set, verifier will randomize high
 * 32-bit for those instructions which have been identified as safe to ignore
 * them. Then, if verifier is not doing correct analysis, such randomization
 * will regress tests to expose bugs.
 */
#define BPF_F_TEST_RND_HI32	(1U << 2)

/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ	(1U << 3)

/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
 * restrict map and helper usage for such programs. Sleepable BPF programs can
 * only be attached to hooks where kernel execution context allows sleeping.
 * Such programs are allowed to use helpers that may sleep like
 * bpf_copy_from_user().
 */
#define BPF_F_SLEEPABLE		(1U << 4)

/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program
 * fully supports xdp frags.
 */
#define BPF_F_XDP_HAS_FRAGS	(1U << 5)

/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
 * program becomes device-bound but can access XDP metadata.
 */
#define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)

/* link_create.kprobe_multi.flags used in LINK_CREATE command for
 * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
 */
#define BPF_F_KPROBE_MULTI_RETURN	(1U << 0)

/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
 * the following extensions:
 *
 * insn[0].src_reg:  BPF_PSEUDO_MAP_[FD|IDX]
 * insn[0].imm:      map fd or fd_idx
 * insn[1].imm:      0
 * insn[0].off:      0
 * insn[1].off:      0
 * ldimm64 rewrite:  address of map
 * verifier type:    CONST_PTR_TO_MAP
 */
#define BPF_PSEUDO_MAP_FD	1
#define BPF_PSEUDO_MAP_IDX	5

/* insn[0].src_reg:  BPF_PSEUDO_MAP_[IDX_]VALUE
 * insn[0].imm:      map fd or fd_idx
 * insn[1].imm:      offset into value
 * insn[0].off:      0
 * insn[1].off:      0
 * ldimm64 rewrite:  address of map[0]+offset
 * verifier type:    PTR_TO_MAP_VALUE
 */
#define BPF_PSEUDO_MAP_VALUE		2
#define BPF_PSEUDO_MAP_IDX_VALUE	6

/* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
 * insn[0].imm:      kernel btf id of VAR
 * insn[1].imm:      0
 * insn[0].off:      0
 * insn[1].off:      0
 * ldimm64 rewrite:  address of the kernel variable
 * verifier type:    PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
 *                   is struct/union.
 */
#define BPF_PSEUDO_BTF_ID	3

/* insn[0].src_reg:  BPF_PSEUDO_FUNC
 * insn[0].imm:      insn offset to the func
 * insn[1].imm:      0
 * insn[0].off:      0
 * insn[1].off:      0
 * ldimm64 rewrite:  address of the function
 * verifier type:    PTR_TO_FUNC.
 */
#define BPF_PSEUDO_FUNC		4

/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
 * offset to another bpf function
 */
#define BPF_PSEUDO_CALL		1

/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
 * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
 */
#define BPF_PSEUDO_KFUNC_CALL	2

/* flags for BPF_MAP_UPDATE_ELEM command */
enum {
	BPF_ANY		= 0, /* create new element or update existing */
	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
	BPF_EXIST	= 2, /* update existing element */
	BPF_F_LOCK	= 4, /* spin_lock-ed map_lookup/map_update */
};

/* flags for BPF_MAP_CREATE command */
enum {
	BPF_F_NO_PREALLOC	= (1U << 0),
/* Instead of having one common LRU list in the
 * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
 * which can scale and perform better.
 * Note, the LRU nodes (including free nodes) cannot be moved
 * across different LRU lists.
 */
	BPF_F_NO_COMMON_LRU	= (1U << 1),
/* Specify numa node during map creation */
	BPF_F_NUMA_NODE		= (1U << 2),

/* Flags for accessing BPF object from syscall side. */
	BPF_F_RDONLY		= (1U << 3),
	BPF_F_WRONLY		= (1U << 4),

/* Flag for stack_map, store build_id+offset instead of pointer */
	BPF_F_STACK_BUILD_ID	= (1U << 5),

/* Zero-initialize hash function seed. This should only be used for testing. */
	BPF_F_ZERO_SEED		= (1U << 6),

/* Flags for accessing BPF object from program side. */
	BPF_F_RDONLY_PROG	= (1U << 7),
	BPF_F_WRONLY_PROG	= (1U << 8),

/* Clone map from listener for newly accepted socket */
	BPF_F_CLONE		= (1U << 9),

/* Enable memory-mapping BPF map */
	BPF_F_MMAPABLE		= (1U << 10),

/* Share perf_event among processes */
	BPF_F_PRESERVE_ELEMS	= (1U << 11),

/* Create a map that is suitable to be an inner map with dynamic max entries */
	BPF_F_INNER_MAP		= (1U << 12),
};

/* Flags for BPF_PROG_QUERY. */

/* Query effective (directly attached + inherited from ancestor cgroups)
 * programs that will be executed for events within a cgroup.
 * attach_flags with this flag are always returned 0.
 */
#define BPF_F_QUERY_EFFECTIVE	(1U << 0)

/* Flags for BPF_PROG_TEST_RUN */

/* If set, run the test on the cpu specified by bpf_attr.test.cpu */
#define BPF_F_TEST_RUN_ON_CPU	(1U << 0)
/* If set, XDP frames will be transmitted after processing */
#define BPF_F_TEST_XDP_LIVE_FRAMES	(1U << 1)

/* type for BPF_ENABLE_STATS */
enum bpf_stats_type {
	/* enabled run_time_ns and run_cnt */
	BPF_STATS_RUN_TIME = 0,
};

enum bpf_stack_build_id_status {
	/* user space need an empty entry to identify end of a trace */
	BPF_STACK_BUILD_ID_EMPTY = 0,
	/* with valid build_id and offset */
	BPF_STACK_BUILD_ID_VALID = 1,
	/* couldn't get build_id, fallback to ip */
	BPF_STACK_BUILD_ID_IP = 2,
};

#define BPF_BUILD_ID_SIZE	20
struct bpf_stack_build_id {
	__s32		status;
	unsigned char	build_id[BPF_BUILD_ID_SIZE];
	union {
		__u64	offset;
		__u64	ip;
	};
};

#define BPF_OBJ_NAME_LEN 16U

union bpf_attr {
	struct { /* anonymous struct used by BPF_MAP_CREATE command */
		__u32	map_type;	/* one of enum bpf_map_type */
		__u32	key_size;	/* size of key in bytes */
		__u32	value_size;	/* size of value in bytes */
		__u32	max_entries;	/* max number of entries in a map */
		__u32	map_flags;	/* BPF_MAP_CREATE related
					 * flags defined above.
					 */
		__u32	inner_map_fd;	/* fd pointing to the inner map */
		__u32	numa_node;	/* numa node (effective only if
					 * BPF_F_NUMA_NODE is set).
					 */
		char	map_name[BPF_OBJ_NAME_LEN];
		__u32	map_ifindex;	/* ifindex of netdev to create on */
		__u32	btf_fd;		/* fd pointing to a BTF type data */
		__u32	btf_key_type_id;	/* BTF type_id of the key */
		__u32	btf_value_type_id;	/* BTF type_id of the value */
		__u32	btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
						   * struct stored as the
						   * map value
						   */
		/* Any per-map-type extra fields
		 *
		 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
		 * number of hash functions (if 0, the bloom filter will default
		 * to using 5 hash functions).
		 */
		__u64	map_extra;
	};

	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
		__u32		map_fd;
		__aligned_u64	key;
		union {
			__aligned_u64 value;
			__aligned_u64 next_key;
		};
		__u64		flags;
	};

	struct { /* struct used by BPF_MAP_*_BATCH commands */
		__aligned_u64	in_batch;	/* start batch,
						 * NULL to start from beginning
						 */
		__aligned_u64	out_batch;	/* output: next start batch */
		__aligned_u64	keys;
		__aligned_u64	values;
		__u32		count;		/* input/output:
						 * input: # of key/value
						 * elements
						 * output: # of filled elements
						 */
		__u32		map_fd;
		__u64		elem_flags;
		__u64		flags;
	} batch;

	struct { /* anonymous struct used by BPF_PROG_LOAD command */
		__u32		prog_type;	/* one of enum bpf_prog_type */
		__u32		insn_cnt;
		__aligned_u64	insns;
		__aligned_u64	license;
		__u32		log_level;	/* verbosity level of verifier */
		__u32		log_size;	/* size of user buffer */
		__aligned_u64	log_buf;	/* user supplied buffer */
		__u32		kern_version;	/* not used */
		__u32		prog_flags;
		char		prog_name[BPF_OBJ_NAME_LEN];
		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
		/* For some prog types expected attach type must be known at
		 * load time to verify attach type specific parts of prog
		 * (context accesses, allowed helpers, etc).
		 */
		__u32		expected_attach_type;
		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
		__aligned_u64	func_info;	/* func info */
		__u32		func_info_cnt;	/* number of bpf_func_info records */
		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
		__aligned_u64	line_info;	/* line info */
		__u32		line_info_cnt;	/* number of bpf_line_info records */
		__u32		attach_btf_id;	/* in-kernel BTF type id to attach to */
		union {
			/* valid prog_fd to attach to bpf prog */
			__u32		attach_prog_fd;
			/* or valid module BTF object fd or 0 to attach to vmlinux */
			__u32		attach_btf_obj_fd;
		};
		__u32		core_relo_cnt;	/* number of bpf_core_relo */
		__aligned_u64	fd_array;	/* array of FDs */
		__aligned_u64	core_relos;
		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
	};

	struct { /* anonymous struct used by BPF_OBJ_* commands */
		__aligned_u64	pathname;
		__u32		bpf_fd;
		__u32		file_flags;
	};

	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
		__u32		target_fd;	/* container object to attach to */
		__u32		attach_bpf_fd;	/* eBPF program to attach */
		__u32		attach_type;
		__u32		attach_flags;
		__u32		replace_bpf_fd;	/* previously attached eBPF
						 * program to replace if
						 * BPF_F_REPLACE is used
						 */
	};

	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
		__u32		prog_fd;
		__u32		retval;
		__u32		data_size_in;	/* input: len of data_in */
		__u32		data_size_out;	/* input/output: len of data_out
						 * returns ENOSPC if data_out
						 * is too small.
						 */
		__aligned_u64	data_in;
		__aligned_u64	data_out;
		__u32		repeat;
		__u32		duration;
		__u32		ctx_size_in;	/* input: len of ctx_in */
		__u32		ctx_size_out;	/* input/output: len of ctx_out
						 * returns ENOSPC if ctx_out
						 * is too small.
						 */
		__aligned_u64	ctx_in;
		__aligned_u64	ctx_out;
		__u32		flags;
		__u32		cpu;
		__u32		batch_size;
	} test;

	struct { /* anonymous struct used by BPF_*_GET_*_ID */
		union {
			__u32		start_id;
			__u32		prog_id;
			__u32		map_id;
			__u32		btf_id;
			__u32		link_id;
		};
		__u32		next_id;
		__u32		open_flags;
	};

	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
		__u32		bpf_fd;
		__u32		info_len;
		__aligned_u64	info;
	} info;

	struct { /* anonymous struct used by BPF_PROG_QUERY command */
		__u32		target_fd;	/* container object to query */
		__u32		attach_type;
		__u32		query_flags;
		__u32		attach_flags;
		__aligned_u64	prog_ids;
		__u32		prog_cnt;
		/* output: per-program attach_flags.
		 * not allowed to be set during effective query.
		 */
		__aligned_u64	prog_attach_flags;
	} query;

	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
		__u64		name;
		__u32		prog_fd;
	} raw_tracepoint;

	struct { /* anonymous struct for BPF_BTF_LOAD */
		__aligned_u64	btf;
		__aligned_u64	btf_log_buf;
		__u32		btf_size;
		__u32		btf_log_size;
		__u32		btf_log_level;
	};

	struct {
		__u32		pid;		/* input: pid */
		__u32		fd;		/* input: fd */
		__u32		flags;		/* input: flags */
		__u32		buf_len;	/* input/output: buf len */
		__aligned_u64	buf;		/* input/output:
						 *   tp_name for tracepoint
						 *   symbol for kprobe
						 *   filename for uprobe
						 */
		__u32		prog_id;	/* output: prog_id */
		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
		__u64		probe_offset;	/* output: probe_offset */
		__u64		probe_addr;	/* output: probe_addr */
	} task_fd_query;

	struct { /* struct used by BPF_LINK_CREATE command */
		__u32		prog_fd;	/* eBPF program to attach */
		union {
			__u32		target_fd;	/* object to attach to */
			__u32		target_ifindex; /* target ifindex */
		};
		__u32		attach_type;	/* attach type */
		__u32		flags;		/* extra flags */
		union {
			__u32		target_btf_id;	/* btf_id of target to attach to */
			struct {
				__aligned_u64	iter_info;	/* extra bpf_iter_link_info */
				__u32		iter_info_len;	/* iter_info length */
			};
			struct {
				/* black box user-provided value passed through
				 * to BPF program at the execution time and
				 * accessible through bpf_get_attach_cookie() BPF helper
				 */
				__u64		bpf_cookie;
			} perf_event;
			struct {
				__u32		flags;
				__u32		cnt;
				__aligned_u64	syms;
				__aligned_u64	addrs;
				__aligned_u64	cookies;
			} kprobe_multi;
			struct {
				/* this is overlaid with the target_btf_id above. */
				__u32		target_btf_id;
				/* black box user-provided value passed through
				 * to BPF program at the execution time and
				 * accessible through bpf_get_attach_cookie() BPF helper
				 */
				__u64		cookie;
			} tracing;
		};
	} link_create;

	struct { /* struct used by BPF_LINK_UPDATE command */
		__u32		link_fd;	/* link fd */
		/* new program fd to update link with */
		__u32		new_prog_fd;
		__u32		flags;		/* extra flags */
		/* expected link's program fd; is specified only if
		 * BPF_F_REPLACE flag is set in flags
		 */
		__u32		old_prog_fd;
	} link_update;

	struct {
		__u32		link_fd;
	} link_detach;

	struct { /* struct used by BPF_ENABLE_STATS command */
		__u32		type;
	} enable_stats;

	struct { /* struct used by BPF_ITER_CREATE command */
		__u32		link_fd;
		__u32		flags;
	} iter_create;

	struct { /* struct used by BPF_PROG_BIND_MAP command */
		__u32		prog_fd;
		__u32		map_fd;
		__u32		flags;		/* extra flags */
	} prog_bind_map;

} __attribute__((aligned(8)));

/* The description below is an attempt at providing documentation to eBPF
 * developers about the multiple available eBPF helper functions. It can be
 * parsed and used to produce a manual page. The workflow is the following,
 * and requires the rst2man utility:
 *
 *     $ ./scripts/bpf_doc.py \
 *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
 *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
 *     $ man /tmp/bpf-helpers.7
 *
 * Note that in order to produce this external documentation, some RST
 * formatting is used in the descriptions to get "bold" and "italics" in
 * manual pages. Also note that the few trailing white spaces are
 * intentional, removing them would break paragraphs for rst2man.
* * Start of BPF helper function descriptions: * * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) * Description * Perform a lookup in *map* for an entry associated to *key*. * Return * Map value associated to *key*, or **NULL** if no entry was * found. * * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: * * **BPF_NOEXIST** * The entry for *key* must not exist in the map. * **BPF_EXIST** * The entry for *key* must already exist in the map. * **BPF_ANY** * No condition on the existence of the entry for *key*. * * Flag value **BPF_NOEXIST** cannot be used for maps of types * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all * elements always exist), the helper would return an error. * Return * 0 on success, or a negative error in case of failure. * * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. * * Generally, use **bpf_probe_read_user**\ () or * **bpf_probe_read_kernel**\ () instead. * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. * Does not include time the system was suspended. * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) * Return * Current *ktime*. * * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. 
It * prints a message defined by format *fmt* (of size *fmt_size*) * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if * available. It can take up to three additional **u64** * arguments (as an eBPF helpers, the total number of arguments is * limited to five). * * Each time the helper is called, it appends a line to the trace. * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in * *\/sys/kernel/debug/tracing/trace_options* (see also the * *README* file under the same directory). However, it usually * defaults to something like: * * :: * * telnet-470 [001] .N.. 419421.045894: 0x00000001: * * In the above: * * * ``telnet`` is the name of the current task. * * ``470`` is the PID of the current task. * * ``001`` is the CPU number on which the task is * running. * * In ``.N..``, each character refers to a set of * options (whether irqs are enabled, scheduling * options, whether hard/softirqs are running, level of * preempt_disabled respectively). **N** means that * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** * are set. * * ``419421.045894`` is a timestamp. * * ``0x00000001`` is a fake value used by BPF for the * instruction pointer register. * * ```` is the message formatted with * *fmt*. * * The conversion specifiers supported by *fmt* are similar, but * more limited than for printk(). They are **%d**, **%i**, * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size * of field, padding with zeroes, etc.) is available, and the * helper will return **-EINVAL** (but print nothing) if it * encounters an unknown specifier. * * Also, note that **bpf_trace_printk**\ () is slow, and should * only be used for debugging purposes. 
For this reason, a notice * block (spanning several lines) is printed to kernel logs and * states that the helper should not be used "for production use" * the first time this helper is used (or more precisely, when * **trace_printk**\ () buffers are allocated). For passing values * to user space, perf events should be preferred. * Return * The number of bytes written to the buffer, or a negative error * in case of failure. * * u32 bpf_get_prandom_u32(void) * Description * Get a pseudo-random number. * * From a security point of view, this helper uses its own * pseudo-random internal state, and cannot be used to infer the * seed of other random functions in the kernel. However, it is * essential to note that the generator used by the helper is not * cryptographically secure. * Return * A random 32-bit unsigned value. * * u32 bpf_get_smp_processor_id(void) * Description * Get the SMP (symmetric multiprocessing) processor id. Note that * all programs run with migration disabled, which means that the * SMP processor id is stable during all the execution of the * program. * Return * The SMP id of the processor running the program. * * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the * checksum for the packet after storing the bytes) and * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ * **->swhash** and *skb*\ **->l4hash** to 0). * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. 
* * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper * must know the former value of the header field that was * modified (*from*), the new value of this field (*to*), and the * number of bytes (2 or 4) for this field, stored in *size*. * Alternatively, it is possible to store the difference between * the previous and the new values of the header field in *to*, by * setting *from* and *size* to 0. For both methods, *offset* * indicates the location of the IP checksum within the packet. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more * flexibility and can handle sizes larger than 2 or 4 for the * checksum to update. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the * helper must know the former value of the header field that was * modified (*from*), the new value of this field (*to*), and the * number of bytes (2 or 4) for this field, stored on the lowest * four bits of *flags*. Alternatively, it is possible to store * the difference between the previous and the new values of the * header field in *to*, by setting *from* and the four lowest * bits of *flags* to 0. For both methods, *offset* indicates the * location of the IP checksum within the packet. 
In addition to * the size of the field, *flags* can be added (bitwise OR) actual * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates * the checksum is to be computed against a pseudo-header. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more * flexibility and can handle sizes larger than 2 or 4 for the * checksum to update. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack * frame is used (but values on stack and in registers for the * caller are not accessible to the callee). This mechanism allows * for program chaining, either for raising the maximum number of * available eBPF instructions, or to execute given programs in * conditional blocks. For security reasons, there is an upper * limit to the number of successive tail calls that can be * performed. * * Upon call of this helper, the program attempts to jump into a * program referenced at index *index* in *prog_array_map*, a * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes * *ctx*, a pointer to the context. * * If the call succeeds, the kernel immediately runs the first * instruction of the new program. This is not a function call, * and it never returns to the previous program. 
If the call * fails, then the helper has no effect, and the caller continues * to run its subsequent instructions. A call can fail if the * destination program for the jump does not exist (i.e. *index* * is superior to the number of entries in *prog_array_map*), or * if the maximum number of tail calls has been reached for this * chain of programs. This limit is defined in the kernel by the * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), * which is currently set to 33. * Return * 0 on success, or a negative error in case of failure. * * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress * interfaces can be used for redirection. The **BPF_F_INGRESS** * value in *flags* is used to make the distinction (ingress path * is selected if the flag is present, egress path otherwise). * This is the only flag supported for now. * * In comparison with **bpf_redirect**\ () helper, * **bpf_clone_redirect**\ () has the associated cost of * duplicating the packet buffer, but this can be executed out of * the eBPF program. Conversely, **bpf_redirect**\ () is more * efficient, but it is handled through an action code where the * redirection happens only after the eBPF program has returned. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_get_current_pid_tgid(void) * Description * Get the current pid and tgid. * Return * A 64-bit integer containing the current tgid and pid, and * created as such: * *current_task*\ **->tgid << 32 \|** * *current_task*\ **->pid**. 
 *
 * u64 bpf_get_current_uid_gid(void)
 * 	Description
 * 		Get the current uid and gid.
 * 	Return
 * 		A 64-bit integer containing the current GID and UID, and
 * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
 *
 * long bpf_get_current_comm(void *buf, u32 size_of_buf)
 * 	Description
 * 		Copy the **comm** attribute of the current task into *buf* of
 * 		*size_of_buf*. The **comm** attribute contains the name of
 * 		the executable (excluding the path) for the current task. The
 * 		*size_of_buf* must be strictly positive. On success, the
 * 		helper makes sure that the *buf* is NUL-terminated. On failure,
 * 		it is filled with zeroes.
 * 	Return
 * 		0 on success, or a negative error in case of failure.
 *
 * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
 * 	Description
 * 		Retrieve the classid for the current task, i.e. for the net_cls
 * 		cgroup to which *skb* belongs.
 *
 * 		This helper can be used on TC egress path, but not on ingress.
 *
 * 		The net_cls cgroup provides an interface to tag network packets
 * 		based on a user-provided identifier for all traffic coming from
 * 		the tasks belonging to the related cgroup. See also the related
 * 		kernel documentation, available from the Linux sources in file
 * 		*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
 *
 * 		The Linux kernel has two versions for cgroups: there are
 * 		cgroups v1 and cgroups v2. Both are available to users, who can
 * 		use a mixture of them, but note that the net_cls cgroup is for
 * 		cgroup v1 only. This makes it incompatible with BPF programs
 * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
 * 		only hold data for one version of cgroups at a time).
 *
 * 		This helper is only available if the kernel was compiled with
 * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
 * 		"**y**" or to "**m**".
 * 	Return
 * 		The classid, or 0 for the default unconfigured classid.
* * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update * the checksum. Note that if *vlan_proto* is different from * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to * be **ETH_P_8021Q**. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be * filled with tunnel metadata for the packet associated to *skb*. * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which * indicates that the tunnel is based on IPv6 protocol instead of * IPv4. * * The **struct bpf_tunnel_key** is an object that generalizes the * principal parameters used by various tunneling protocols into a * single struct. This way, it can be used to easily make a * decision based on the contents of the encapsulation header, * "summarized" in this struct. 
In particular, it holds the IP * address of the remote end (IPv4 or IPv6, depending on the case) * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, * this struct exposes the *key*\ **->tunnel_id**, which is * generally mapped to a VNI (Virtual Network Identifier), making * it programmable together with the **bpf_skb_set_tunnel_key**\ * () helper. * * Let's imagine that the following code is part of a program * attached to the TC ingress interface, on one end of a GRE * tunnel, and is supposed to filter out all messages coming from * remote ends with IPv4 address other than 10.0.0.1: * * :: * * int ret; * struct bpf_tunnel_key key = {}; * * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); * if (ret < 0) * return TC_ACT_SHOT; // drop packet * * if (key.remote_ipv4 != 0x0a000001) * return TC_ACT_SHOT; // drop packet * * return TC_ACT_OK; // accept packet * * This interface can also be used with all encapsulation devices * that can operate in "collect metadata" mode: instead of having * one network device per specific configuration, the "collect * metadata" mode only requires a single device where the * configuration can be extracted from this helper. * * This can be used together with various tunnels such as VXLan, * Geneve, GRE or IP in IP (IPIP). * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The * *flags* can be set to a combination of the following values: * * **BPF_F_TUNINFO_IPV6** * Indicate that the tunnel is based on IPv6 protocol * instead of IPv4. * **BPF_F_ZERO_CSUM_TX** * For IPv4 packets, add a flag to tunnel metadata * indicating that checksum computation should be skipped * and checksum set to zeroes. 
* **BPF_F_DONT_FRAGMENT** * Add a flag to tunnel metadata indicating that the * packet should not be fragmented. * **BPF_F_SEQ_NUMBER** * Add a flag to tunnel metadata indicating that a * sequence number should be added to tunnel header before * sending the packet. This flag was added for GRE * encapsulation, but might be used with other protocols * as well in the future. * **BPF_F_NO_TUNNEL_KEY** * Add a flag to tunnel metadata indicating that no tunnel * key should be set in the resulting tunnel header. * * Here is a typical usage on the transmit path: * * :: * * struct bpf_tunnel_key key; * populate key ... * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); * * See also the description of the **bpf_skb_get_tunnel_key**\ () * helper for additional information. * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) * Description * Read the value of a perf event counter. This helper relies on a * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of * the perf event counter is selected when *map* is updated with * perf event file descriptors. The *map* is an array whose size * is the number of available CPUs, and each cell contains a value * relative to one CPU. The value to retrieve is indicated by * *flags*, that contains the index of the CPU to look up, masked * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to * **BPF_F_CURRENT_CPU** to indicate that the value for the * current CPU should be retrieved. * * Note that before Linux 4.13, only hardware perf event can be * retrieved. * * Also, be aware that the newer helper * **bpf_perf_event_read_value**\ () is recommended over * **bpf_perf_event_read**\ () in general. The latter has some ABI * quirks where error and counter value are used as a return code * (which is wrong to do since ranges may overlap). 
This issue is
 * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
 * 		time provides more features over the **bpf_perf_event_read**\
 * 		() interface. Please refer to the description of
 * 		**bpf_perf_event_read_value**\ () for details.
 * 	Return
 * 		The value of the perf event counter read from the map, or a
 * 		negative error code in case of failure.
 *
 * long bpf_redirect(u32 ifindex, u64 flags)
 * 	Description
 * 		Redirect the packet to another net device of index *ifindex*.
 * 		This helper is somewhat similar to **bpf_clone_redirect**\
 * 		(), except that the packet is not cloned, which provides
 * 		increased performance.
 *
 * 		Except for XDP, both ingress and egress interfaces can be used
 * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
 * 		to make the distinction (ingress path is selected if the flag
 * 		is present, egress path otherwise). Currently, XDP only
 * 		supports redirection to the egress interface, and accepts no
 * 		flag at all.
 *
 * 		The same effect can also be attained with the more generic
 * 		**bpf_redirect_map**\ (), which uses a BPF map to store the
 * 		redirect target instead of providing it directly to the helper.
 * 	Return
 * 		For XDP, the helper returns **XDP_REDIRECT** on success or
 * 		**XDP_ABORTED** on error. For other program types, the values
 * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
 * 		error.
 *
 * u32 bpf_get_route_realm(struct sk_buff *skb)
 * 	Description
 * 		Retrieve the realm of the route, that is to say the
 * 		**tclassid** field of the destination for the *skb*. The
 * 		identifier retrieved is a user-provided tag, similar to the
 * 		one used with the net_cls cgroup (see description for
 * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
 * 		held by a route (a destination entry), not by a task.
 *
 * 		Retrieving this identifier works with the clsact TC egress hook
 * 		(see also **tc-bpf(8)**), or alternatively on conventional
 * 		classful egress qdiscs, but not on TC ingress path.
In case of * clsact TC egress hook, this has the advantage that, internally, * the destination entry has not been dropped yet in the transmit * path. Therefore, the destination entry does not need to be * artificially held via **netif_keep_dst**\ () for a classful * qdisc until the *skb* is freed. * * This helper is available only if the kernel was compiled with * **CONFIG_IP_ROUTE_CLASSID** configuration option. * Return * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf * event must have the following attributes: **PERF_SAMPLE_RAW** * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. * * The *flags* are used to indicate the index in *map* for which * the value must be put, masked with **BPF_F_INDEX_MASK**. * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** * to indicate that the index of the current CPU core should be * used. * * The value to write, of *size*, is passed through eBPF stack and * pointed by *data*. * * The context of the program *ctx* needs also be passed to the * helper. * * On user space, a program willing to read the values needs to * call **perf_event_open**\ () on the perf event (either for * one or for all CPUs) and to store the file descriptor into the * *map*. This must be done before the eBPF program can send data * into it. An example is available in file * *samples/bpf/trace_output_user.c* in the Linux kernel source * tree (the eBPF program counterpart is in * *samples/bpf/trace_output_kern.c*). * * **bpf_perf_event_output**\ () achieves better performance * than **bpf_trace_printk**\ () for sharing data with user * space, and is much better suitable for streaming data from eBPF * programs. 
* * Note that this helper is not restricted to tracing use cases * and can be used with programs attached to TC or XDP as well, * where it allows for passing data to user space listeners. Data * can be: * * * Only custom structs, * * Only the packet payload, or * * A combination of both. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from * the packet associated to *skb*, into the buffer pointed by * *to*. * * Since Linux 4.7, usage of this helper has mostly been replaced * by "direct packet access", enabling packet data to be * manipulated with *skb*\ **->data** and *skb*\ **->data_end** * pointing respectively to the first byte of packet data and to * the byte after the last byte of packet data. However, it * remains useful if one wishes to read large quantities of data * at once from a packet into the eBPF stack. * Return * 0 on success, or a negative error in case of failure. * * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context * on which the tracing program is executed, and a pointer to a * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set * a combination of the following flags: * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_FAST_STACK_CMP** * Compare stacks by hash only. * **BPF_F_REUSE_STACKID** * If two different stacks hash into the same *stackid*, * discard the old one. 
* * The stack id retrieved is a 32 bit long integer handle which * can be further combined with other data (including other stack * ids) and used as a key into maps. This can be useful for * generating a variety of graphs (such as flame graphs or off-cpu * graphs). * * For walking a stack, this helper is an improvement over * **bpf_probe_read**\ (), which can be used with unrolled loops * but is not efficient and consumes a lot of eBPF instructions. * Instead, **bpf_get_stackid**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that * this limit can be controlled with the **sysctl** program, and * that it should be manually increased in order to profile long * user stacks (such as stacks for Java programs). To do so, use: * * :: * * # sysctl kernel.perf_event_max_stack=<new value> * Return * The positive or null stack id on success, or a negative error * in case of failure. * * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) * Description * Compute a checksum difference, from the raw buffer pointed by * *from*, of length *from_size* (that must be a multiple of 4), * towards the raw buffer pointed by *to*, of size *to_size* * (same remark). An optional *seed* can be added to the value * (this can be cascaded, the seed may come from a previous call * to the helper). * * This is flexible enough to be used in several ways: * * * With *from_size* == 0, *to_size* > 0 and *seed* set to * checksum, it can be used when pushing new data. * * With *from_size* > 0, *to_size* == 0 and *seed* set to * checksum, it can be used when removing data from a packet. * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it * can be used to compute a diff. Note that *from_size* and * *to_size* do not need to be equal. * * This helper can be used in combination with * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to * which one can feed in the difference computed with * **bpf_csum_diff**\ (). 
* Return * The checksum result, or a negative error code in case of * failure. * * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* * of *size*. * * This helper can be used with encapsulation devices that can * operate in "collect metadata" mode (please refer to the related * note in the description of **bpf_skb_get_tunnel_key**\ () for * more details). A particular example where this can be used is * in combination with the Geneve encapsulation protocol, where it * allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper) * and retrieving arbitrary TLVs (Type-Length-Value headers) from * the eBPF program. This allows for full customization of these * headers. * Return * The size of the option data retrieved. * * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. * * See also the description of the **bpf_skb_get_tunnel_opt**\ () * helper for additional information. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to * IPv4. The helper takes care of the groundwork for the * transition, including resizing the socket buffer. The eBPF * program is expected to fill the new headers, if any, via * **skb_store_bytes**\ () and to recompute the checksums with * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ * (). The main case for this helper is to perform NAT64 * operations out of an eBPF program. 
* * Internally, the GSO type is marked as dodgy so that headers are * checked and segments are recalculated by the GSO/GRO engine. * The size for GSO target is adapted as well. * * All values for *flags* are reserved for future usage, and must * be left at zero. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except * the eBPF program does not have a write access to *skb*\ * **->pkt_type** beside this helper. Using a helper here allows * for graceful handling of errors. * * The major use case is to change incoming *skb*s to * **PACKET_HOST** in a programmatic way instead of having to * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for * example. * * Note that *type* only allows certain values. At this time, they * are: * * **PACKET_HOST** * Packet is for us. * **PACKET_BROADCAST** * Send packet to all. * **PACKET_MULTICAST** * Send packet to group. * **PACKET_OTHERHOST** * Send packet to someone else. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. * Return * The return value depends on the result of the test, and can be: * * * 0, if the *skb* failed the cgroup2 descendant test. * * 1, if the *skb* succeeded the cgroup2 descendant test. * * A negative error code, if an error occurred. 
* * u32 bpf_get_hash_recalc(struct sk_buff *skb) * Description * Retrieve the hash of the packet, *skb*\ **->hash**. If it is * not set, in particular if the hash was cleared due to mangling, * recompute this hash. Later accesses to the hash can be done * directly with *skb*\ **->hash**. * * Calling **bpf_set_hash_invalid**\ (), changing a packet * prototype with **bpf_skb_change_proto**\ (), or calling * **bpf_skb_store_bytes**\ () with the * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear * the hash and to trigger a new computation for the next call to * **bpf_get_hash_recalc**\ (). * Return * The 32-bit hash. * * u64 bpf_get_current_task(void) * Description * Get the current task. * Return * A pointer to the current task struct. * * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in * user context, and *dst* must be a valid user space address. * * This helper should not be used to implement any kind of * security mechanism because of TOC-TOU attacks, but rather to * debug, divert, and manipulate execution of semi-cooperative * processes. * * Keep in mind that this feature is meant for experiments, and it * has a risk of crashing the system and running programs. * Therefore, when an eBPF program using this helper is attached, * a warning including PID and process name is printed to kernel * logs. * Return * 0 on success, or a negative error in case of failure. * * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run in the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. * Return * The return value depends on the result of the test, and can be: * * * 1, if current task belongs to the cgroup2. 
* * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must * be left at zero. * * The basic idea is that the helper performs the needed work to * change the size of the packet, then the eBPF program rewrites * the rest via helpers like **bpf_skb_store_bytes**\ (), * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ () * and others. This helper is a slow path utility intended for * replies with control messages. And because it is targeted for * slow path, the helper itself can afford to be slow: it * implicitly linearizes, unclones and drops offloads from the * *skb*. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for * *len*, then all bytes in the linear part of *skb* will be made * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. * * For direct packet access, testing that offsets to access * are within packet boundaries (test on *skb*\ **->data_end**) is * susceptible to fail if offsets are invalid, or if the requested * data is in non-linear parts of the *skb*. On failure the * program can just bail out, or in the case of a non-linear * buffer, use a helper to make the data available. 
The * **bpf_skb_load_bytes**\ () helper is a first solution to access * the data. Another one consists in using **bpf_skb_pull_data** * to pull in once the non-linear parts, then retesting and * eventually access the data. * * At the same time, this also makes sure the *skb* is uncloned, * which is a necessary condition for direct write. As this needs * to be an invariant for the write part only, the verifier * detects writes and adds a prologue that is calling * **bpf_skb_pull_data()** to effectively unclone the *skb* from * the very beginning in case it is indeed cloned. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) * Description * Add the checksum *csum* into *skb*\ **->csum** in case the * driver has supplied a checksum for the entire packet into that * field. Return an error otherwise. This helper is intended to be * used in combination with **bpf_csum_diff**\ (), in particular * when the checksum needs to be updated after data has been * written into the packet through direct packet access. * Return * The checksum on success, or a negative error code in case of * failure. * * void bpf_set_hash_invalid(struct sk_buff *skb) * Description * Invalidate the current *skb*\ **->hash**. It can be used after * mangling on headers through direct packet access, in order to * indicate that the hash is outdated and to trigger a * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * Return * void. * * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. 
The primary use case * for this helper is the selection of sockets for the local NUMA * node, when the program is attached to sockets using the * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), * but the helper is also available to other eBPF program types, * similarly to **bpf_get_smp_processor_id**\ (). * Return * The id of current NUMA node. * * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of * space. It automatically extends and reallocates memory as * required. * * This helper can be used on a layer 3 *skb* to push a MAC header * for redirection into a layer 2 device. * * All values for *flags* are reserved for future usage, and must * be left at zero. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper * can be used to prepare the packet for pushing or popping * headers. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. 
See **bpf_probe_read_kernel_str**\ () for * more details. * * Generally, use **bpf_probe_read_user_str**\ () or * **bpf_probe_read_kernel_str**\ () instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. * * u64 bpf_get_socket_cookie(struct sk_buff *skb) * Description * If the **struct sk_buff** pointed by *skb* has a known socket, * retrieve the cookie (generated by the kernel) of this socket. * If no cookie has been set yet, generate a new cookie. Once * generated, the socket cookie remains stable for the life of the * socket. This helper can be useful for monitoring per socket * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return * An 8-byte long unique number on success, or 0 if the socket * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * An 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * An 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct sock *sk) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *sk*, but gets socket from a BTF **struct sock**. This helper * also works for sleepable programs. * Return * An 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Description * Get the owner UID of the socket associated to *skb*. * Return * The owner UID of the socket associated to *skb*. If the socket * is **NULL**, or if it is not a full socket (i.e. 
if it is a * time-wait or a request socket instead), **overflowuid** value * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at * which the option resides and the name *optname* of the option * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, * **TCP_BPF_RTO_MIN**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. * Return * 0 on success, or a negative error in case of failure. 
* * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * * By default, the helper will reset any offloaded checksum * indicator of the skb to CHECKSUM_NONE. This can be avoided * by the following flag: * * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded * checksum data of the skb to CHECKSUM_NONE. * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer * (room space is added or removed between the layer 2 and * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed between the layer 3 and * layer 4 headers). * * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: * Any new space is reserved to hold a tunnel header. * Configure skb offsets and other fields accordingly. * * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: * Use with ENCAP_L3 flags to further specify the tunnel type. * * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: * Indicate the new IP header version after decapsulating the outer * IP header. Used when the inner and outer IP versions are different. * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain * references to net devices (for forwarding packets through other * ports), or to CPUs (for redirecting XDP frames to another CPU; * but this is only implemented for native XDP (with driver * support) as of this writing). * * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be * one of the XDP program return codes up to **XDP_TX**, as chosen * by the caller. The higher bits of *flags* can be set to * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. * * With BPF_F_BROADCAST the packet will be broadcasted to all the * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress * interface will be excluded when doing broadcasting. * * See also **bpf_redirect**\ (), which only supports redirecting * to an ifindex, but doesn't require a map to do so. * Return * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and * egress interfaces can be used for redirection. The * **BPF_F_INGRESS** value in *flags* is used to make the * distinction (ingress path is selected if the flag is present, * egress path otherwise). This is the only flag supported for now. 
* * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to * *key*. *flags* is one of: * * **BPF_NOEXIST** * The entry for *key* must not exist in the map. * **BPF_EXIST** * The entry for *key* must already exist in the map. * **BPF_ANY** * No condition on the existence of the entry for *key*. * * If the *map* has eBPF programs (parser and verdict), those will * be inherited by the socket being added. If the socket is * already attached to eBPF programs, this results in an error. * Return * 0 on success, or a negative error in case of failure. * * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this * operation modifies the address stored in *xdp_md*\ **->data**, * so the latter must be loaded only after the helper has been * called. * * The use of *xdp_md*\ **->data_meta** is optional and programs * are not required to use it. The rationale is that when the * packet is processed with XDP (e.g. as DoS filter), it is * possible to push further meta data along with it before passing * to the stack, and to give the guarantee that an ingress eBPF * program attached as a TC classifier on the same device can pick * this up for further post-processing. Since TC works with socket * buffers, it remains possible to set from XDP the **mark** or * **priority** pointers, or other pointers for the socket buffer. * Having this scratch space generic and programmable allows for * more flexibility as the user is free to store whatever meta * data they need. * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event * counter is selected when *map* is updated with perf event file * descriptors. The *map* is an array whose size is the number of * available CPUs, and each cell contains a value relative to one * CPU. The value to retrieve is indicated by *flags*, that * contains the index of the CPU to look up, masked with * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to * **BPF_F_CURRENT_CPU** to indicate that the value for the * current CPU should be retrieved. * * This helper behaves in a way close to * **bpf_perf_event_read**\ () helper, save that instead of * just returning the value observed, it fills the *buf* * structure. This allows for additional data to be retrieved: in * particular, the enabled and running times (in *buf*\ * **->enabled** and *buf*\ **->running**, respectively) are * copied. In general, **bpf_perf_event_read_value**\ () is * recommended over **bpf_perf_event_read**\ (), which has some * ABI issues and provides fewer functionalities. * * These values are interesting, because hardware PMU (Performance * Monitoring Unit) counters are limited resources. When there are * more PMU based perf events opened than available counters, * kernel will multiplex these events so each event gets certain * percentage (but not all) of the PMU time. 
In case that * multiplexing happens, the number of samples or counter value * will not reflect the case compared to when no multiplexing * occurs. This makes comparison between different runs difficult. * Typically, the counter value should be normalized before * comparing to other experiments. The usual normalization is done * as follows. * * :: * * normalized_counter = counter * t_enabled / t_running * * Where t_enabled is the time enabled for event and t_running is * the time running for event since last normalization. The * enabled and running times are accumulated since the perf event * open. To achieve scaling factor between two invocations of an * eBPF program, users can use CPU id as the key (which is * typical for perf array usage model) to remember the previous * value and do the calculation inside the eBPF program. * Return * 0 on success, or a negative error in case of failure. * * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in * the structure pointed by *buf* and of size *buf_size*. Enabled * and running times are also stored in the structure (see * description of helper **bpf_perf_event_read_value**\ () for * more details). * Return * 0 on success, or a negative error in case of failure. * * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at * which the option resides and the name *optname* of the option * must be specified, see **getsockopt(2)** for more information. * The retrieved value is stored in the structure pointed by * *optval* and of length *optlen*. * * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by * the **bpf_setsockopt**\ () helper. The exceptions are * **TCP_BPF_*** is **bpf_setsockopt**\ () only and * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. * Return * 0 on success, or a negative error in case of failure. * * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. * The first argument is the context *regs* on which the kprobe * works. * * This helper works by setting the PC (program counter) * to an override function which is run in place of the original * probed function. This means the probed function is not run at * all. The replacement function just returns with the required * value. * * This helper has security implications, and thus is subject to * restrictions. It is only available if the kernel was compiled * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration * option, and in this case it only works on functions tagged with * **ALLOW_ERROR_INJECTION** in the kernel code. * * Also, the helper is only available for the architectures having * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, * x86 architecture is the only one to support this feature. * Return * 0 * * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to * *argval*. * * The primary use of this field is to determine if there should * be calls to eBPF programs of type * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP * code. 
A program of the same type can change its value, per * connection and as necessary, when the connection is * established. This field is directly accessible for reading, but * this helper must be used for updates in order to return an * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) * * Therefore, this function can be used to clear a callback flag by * setting the appropriate bit to zero. e.g. to disable the RTO * callback: * * **bpf_sock_ops_cb_flags_set(bpf_sock,** * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** * * Here are some examples of where one could call such eBPF * program: * * * When RTO fires. * * When a packet is retransmitted. * * When the connection terminates. * * When a packet is sent. * * When a packet is received. * Return * Code **-EINVAL** if the socket is not a full TCP socket; * otherwise, a positive number containing the bits that could not * be set is returned (which comes down to 0 if all bits were set * as required). * * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and * egress interfaces can be used for redirection. The * **BPF_F_INGRESS** value in *flags* is used to make the * distinction (ingress path is selected if the flag is present, * egress path otherwise). This is the only flag supported for now. 
* Return * **SK_PASS** on success, or **SK_DROP** on error. * * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. * * For example, this helper can be used in the following cases: * * * A single **sendmsg**\ () or **sendfile**\ () system call * contains multiple logical messages that the eBPF program is * supposed to read and for which it should apply a verdict. * * An eBPF program only cares to read the first *bytes* of a * *msg*. If the message has a large payload, then setting up * and calling the eBPF program repeatedly for all bytes, even * though the verdict is already known, would create unnecessary * overhead. * * When called from within an eBPF program, the helper sets a * counter internal to the BPF infrastructure, that is used to * apply the last verdict to the next *bytes*. If *bytes* is * smaller than the current data being processed from a * **sendmsg**\ () or **sendfile**\ () system call, the first * *bytes* will be sent and the eBPF program will be re-run with * the pointer for start of data pointing to byte number *bytes* * **+ 1**. If *bytes* is larger than the current data being * processed, then the eBPF verdict will be applied to multiple * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are * consumed. * * Note that if a socket closes with the internal counter holding * a non-zero value, this is not a problem because data is not * being buffered for *bytes* and is sent as it is received. * Return * 0 * * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been * accumulated. * * This can be used when one needs a specific number of bytes * before a verdict can be assigned, even if the data spans * multiple **sendmsg**\ () or **sendfile**\ () calls. 
The extreme * case would be a user calling **sendmsg**\ () repeatedly with * 1-byte long message segments. Obviously, this is bad for * performance, but it is still valid. If the eBPF program needs * *bytes* bytes to validate a header, this helper can be used to * prevent the eBPF program from being called again until *bytes* have * been accumulated. * Return * 0 * * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ * **->data_end** to *start* and *end* bytes offsets into *msg*, * respectively. * * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a * *msg* it can only parse data that the (**data**, **data_end**) * pointers have already consumed. For **sendmsg**\ () hooks this * is likely the first scatterlist element. But for calls relying * on the **sendpage** handler (e.g. **sendfile**\ ()) this will * be the range (**0**, **0**) because the data is shared with * user space and by default the objective is to avoid allowing * user space to modify data while (or after) eBPF verdict is * being decided. This helper can be used to pull in data and to * set the start and end pointer to given values. Data will be * copied if necessary (i.e. if data was not linear and if start * and end pointers do not point to the same chunk). * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * * All values for *flags* are reserved for future usage, and must * be left at zero. * Return * 0 on success, or a negative error in case of failure. 
* * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing * connection from the desired IP address, which can be useful for * example when all processes inside a cgroup should use one * single IP address on a host that has multiple IP configured. * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or * **AF_INET6**). It's advised to pass zero port (**sin_port** * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like * behavior and lets the kernel efficiently pick up an unused * port as long as 4-tuple is unique. Passing non-zero port might * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. * * The retrieved value is stored in the **struct bpf_xfrm_state** * pointed by *xfrm_state* and of length *size*. * * All values for *flags* are reserved for future usage, and must * be left at zero. 
* * This helper is available only if the kernel was compiled with * **CONFIG_XFRM** configuration option. * Return * 0 on success, or a negative error in case of failure. * * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer * to the context on which the tracing program is executed. * To store the stacktrace, the bpf program provides *buf* with * a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set * the following flags: * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** * Collect (build_id, file_offset) instead of ips for user * stack, only valid if **BPF_F_USER_STACK** is also * specified. * * *file_offset* is an offset relative to the beginning * of the executable or shared object file backing the vma * which the *ip* falls in. It is *not* an offset relative * to that object's base address. Accordingly, it must be * adjusted by adding (sh_addr - sh_offset), where * sh_{addr,offset} correspond to the executable section * containing *file_offset* in the object, for comparisons * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject * to sufficient large buffer size. Note that * this limit can be controlled with the **sysctl** program, and * that it should be manually increased in order to profile long * user stacks (such as stacks for Java programs). To do so, use: * * :: * * # sysctl kernel.perf_event_max_stack= * Return * The non-negative copied *buf* length equal to or less than * *size* on success, or a negative error in case of failure. 
* * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* * from the packet associated to *skb*, into the buffer pointed * by *to*. The difference to **bpf_skb_load_bytes**\ () is that * a fifth argument *start_header* exists in order to select a * base offset to start from. *start_header* can be one of: * * **BPF_HDR_START_MAC** * Base offset to load data from is *skb*'s mac header. * **BPF_HDR_START_NET** * Base offset to load data from is *skb*'s network header. * * In general, "direct packet access" is the preferred method to * access packet data, however, this helper is in particular useful * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. * Return * 0 on success, or a negative error in case of failure. * * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be * forwarded, the neighbor tables are searched for the nexthop. * If successful (ie., FIB lookup shows forwarding and nexthop * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric * is set to metric from route (IPv4/IPv6 only), and ifindex * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the * following values: * * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). 
* **BPF_FIB_LOOKUP_SKIP_NEIGH** * Skip the neighbour table lookup. *params*->dmac * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** for tc cls_act programs. * Return * * < 0 if any input argument is invalid * * 0 on success (packet is forwarded, nexthop neighbor exists) * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU * was exceeded and output params->mtu_result contains the MTU. * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to * *key*. *flags* is one of: * * **BPF_NOEXIST** * The entry for *key* must not exist in the map. * **BPF_EXIST** * The entry for *key* must already exist in the map. * **BPF_ANY** * No condition on the existence of the entry for *key*. * * If the *map* has eBPF programs (parser and verdict), those will * be inherited by the socket being added. If the socket is * already attached to eBPF programs, this results in an error. * Return * 0 on success, or a negative error in case of failure. * * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if * the verdict eBPF program returns **SK_PASS**), redirect it to * the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. 
The * **BPF_F_INGRESS** value in *flags* is used to make the * distinction (ingress path is selected if the flag is present, * egress path otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. * * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The * **BPF_F_INGRESS** value in *flags* is used to make the * distinction (ingress path is selected if the flag is present, * egress otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. * * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at * address *hdr*, with *len* its size in bytes. *type* indicates * the protocol of the header and can be one of: * * **BPF_LWT_ENCAP_SEG6** * IPv6 encapsulation with Segment Routing Header * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, * the IPv6 header is computed by the kernel. * **BPF_LWT_ENCAP_SEG6_INLINE** * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. * **BPF_LWT_ENCAP_IP** * IP encapsulation (GRE/GUE/IPIP/etc). The outer header * must be IPv4 or IPv6, followed by zero or more * additional headers, up to **LWT_BPF_MAX_HEADROOM** * total bytes in all prepended headers. 
Please note that * if **skb_is_gso**\ (*skb*) is true, no more than two * headers can be prepended, and the inner header, if * present, should be either GRE or UDP/GUE. * * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and * **BPF_PROG_TYPE_LWT_XMIT**. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs * inside the outermost IPv6 Segment Routing Header can be * modified through this helper. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to * *skb*, at position *offset* by *delta* bytes. Only offsets * after the segments are accepted. *delta* can be as well * positive (growing) as negative (shrinking). * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter * contained at address *param*, and of length *param_len* bytes. * *action* can be one of: * * **SEG6_LOCAL_ACTION_END_X** * End.X action: Endpoint with Layer-3 cross-connect. * Type of *param*: **struct in6_addr**. * **SEG6_LOCAL_ACTION_END_T** * End.T action: Endpoint with specific IPv6 table lookup. * Type of *param*: **int**. * **SEG6_LOCAL_ACTION_END_B6** * End.B6 action: Endpoint bound to an SRv6 policy. * Type of *param*: **struct ipv6_sr_hdr**. * **SEG6_LOCAL_ACTION_END_B6_ENCAP** * End.B6.Encap action: Endpoint bound to an SRv6 * encapsulation policy. * Type of *param*: **struct ipv6_sr_hdr**. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with * direct packet access. * Return * 0 on success, or a negative error in case of failure. * * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays * the generation of a key up event for previously generated * key down event. * * Some IR protocols like NEC have a special IR message for * repeating last button, for when a button is held down. * * The *ctx* should point to the lirc sample as passed into * the program. 
* * This helper is only available if the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, * *toggle* value in the given *protocol*. The scancode will be * translated to a keycode using the rc keymap, and reported as * an input key down event. After a period a key up event is * generated. This period can be extended by calling either * **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_repeat**\ (). * * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into * the program. * * The *protocol* is the decoded protocol number (see * **enum rc_proto** for some predefined values). * * This helper is only available if the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () * helper for cgroup v1 by providing a tag resp. identifier that * can be matched on or used for map lookups e.g. to implement * policy. The cgroup v2 id of a given path in the hierarchy is * exposed in user space through the f_handle API in order to get * to the same 64-bit id. * * This helper can be used on TC egress path, but not on ingress, * and is available only if the kernel was compiled with the * **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. 
* * u64 bpf_get_current_cgroup_id(void) * Description * Get the current cgroup id based on the cgroup within which * the current task is running. * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined * by the *map* argument. * The *flags* meaning is specific for each map type, * and has to be 0 for cgroup local storage. * * Depending on the BPF program type, a local storage area * can be shared between multiple instances of the BPF program, * running simultaneously. * * A user should care about the synchronization by himself. * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. * * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *skb* at the *ancestor_level*. The root cgroup is at * *ancestor_level* zero and each step down the hierarchy * increments the level. If *ancestor_level* == level of cgroup * associated with *skb*, then return value will be same as that * of **bpf_skb_cgroup_id**\ (). * * The helper is useful to implement policies based on cgroups * that are upper in hierarchy than immediate cgroup associated * with *skb*. * * The format of returned id and helper limitations are same as in * **bpf_skb_cgroup_id**\ (). 
* Return * The id is returned or 0 in case the id could not be retrieved. * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, * and if non-**NULL**, released via **bpf_sk_release**\ (). * * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). This is used * to determine the base network namespace for the lookup. * * *tuple_size* must be one of: * * **sizeof**\ (*tuple*\ **->ipv4**) * Look for an IPv4 socket. * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or * equal to zero then it specifies the ID of the netns relative to * the netns associated with the *ctx*. *netns* values beyond the * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. * * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, * and if non-**NULL**, released via **bpf_sk_release**\ (). 
* * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). This is used * to determine the base network namespace for the lookup. * * *tuple_size* must be one of: * * **sizeof**\ (*tuple*\ **->ipv4**) * Look for an IPv4 socket. * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or * equal to zero then it specifies the ID of the netns relative to * the netns associated with the *ctx*. *netns* values beyond the * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. * * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from * **bpf_sk_lookup_xxx**\ (). * Return * 0 on success, or a negative error in case of failure. * * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * * **BPF_EXIST** * If the queue/stack is full, the oldest element is * removed to make room for this. * Return * 0 on success, or a negative error in case of failure. * * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. 
* * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. * * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a * *msg* it may want to insert metadata or options into the *msg*. * This can later be read and used by any of the lower layer BPF * hooks. * * This helper may fail if under memory pressure (a malloc * fails) in these cases BPF programs will get an appropriate * error and BPF programs will need to handle them. * Return * 0 on success, or a negative error in case of failure. * * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if * an allocation and copy are required due to a full ring buffer. * However, the helper will try to avoid doing the allocation * if possible. Other errors can occur if input parameters are * invalid either due to *start* byte not being valid part of *msg* * payload and/or *pop* value being too large. * Return * 0 on success, or a negative error in case of failure. * * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. * * The *ctx* should point to the lirc sample as passed into * the program. * * This helper is only available if the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. 
Taking the lock allows to * safely update the rest of the fields in that value. The * spinlock can (and must) later be released with a call to * **bpf_spin_unlock**\ (\ *lock*\ ). * * Spinlocks in BPF programs come with a number of restrictions * and constraints: * * * **bpf_spin_lock** objects are only allowed inside maps of * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this * list could be extended in the future). * * BTF description of the map is mandatory. * * The BPF program can take ONE lock at a time, since taking two * or more could cause dead locks. * * Only one **struct bpf_spin_lock** is allowed per map element. * * When the lock is taken, calls (either BPF to BPF or helpers) * are not allowed. * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not * allowed inside a spinlock-ed region. * * The BPF program MUST call **bpf_spin_unlock**\ () to release * the lock, on all execution paths, before it returns. * * The BPF program can access **struct bpf_spin_lock** only via * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () * helpers. Loading or storing data into the **struct * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. * * To use the **bpf_spin_lock**\ () helper, the BTF description * of the map value must be a struct and have **struct * bpf_spin_lock** *anyname*\ **;** field at the top level. * Nested lock inside another struct is not allowed. * * The **struct bpf_spin_lock** *lock* field in a map value must * be aligned on a multiple of 4 bytes in that value. * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy * the **bpf_spin_lock** field to user space. * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from * a BPF program, do not update the **bpf_spin_lock** field. * * **bpf_spin_lock** cannot be on the stack or inside a * networking packet (it can only be inside of a map values). * * **bpf_spin_lock** is available to root only. 
* * Tracing programs and socket filter programs cannot use * **bpf_spin_lock**\ () due to insufficient preemption checks * (but this may change in the future). * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. * Return * 0 * * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). * Return * 0 * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such * that all the fields in this **bpf_sock** can be accessed. * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. * Return * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 * and IPv4. * Return * 1 if the **CE** flag is set (either by the current helper call * or because it was already present), 0 if it is not set. * * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) * Description * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. * **bpf_sk_release**\ () is unnecessary and not allowed. * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, * and if non-**NULL**, released via **bpf_sk_release**\ (). 
* * This function is identical to **bpf_sk_lookup_tcp**\ (), except * that it also returns timewait or request sockets. Use * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the * full structure. * * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header (at least * **sizeof**\ (**struct tcphdr**)). * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. * * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. * * The buffer is always NUL terminated, unless it's zero-sized. * * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name * only (e.g. "tcp_mem"). * Return * Number of character copied (not including the trailing NUL). * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided * by program buffer *buf* of size *buf_len*. 
* * The whole value is copied, no matter what file position user * space issued e.g. sys_read at. * * The buffer is always NUL terminated, unless it's zero-sized. * Return * Number of character copied (not including the trailing NUL). * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into * provided by program buffer *buf* of size *buf_len*. * * User space may write new value at file position > 0. * * The buffer is always NUL terminated, unless it's zero-sized. * Return * Number of character copied (not including the trailing NUL). * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * * **-EINVAL** if sysctl is being read. * * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. * * *buf* should contain a string in same form as provided by user * space on sysctl write. * * User space may write new value at file position > 0. To override * the whole sysctl value file position should be set to zero. * Return * 0 on success. * * **-E2BIG** if the *buf_len* is too big. * * **-EINVAL** if sysctl is being read. * * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base * and save the result in *res*. 
* * The string may begin with an arbitrary amount of white space * (as determined by **isspace**\ (3)) followed by a single * optional '**-**' sign. * * Five least significant bits of *flags* encode base, other bits * are currently unused. * * Base must be either 8, 10, 16 or 0 to detect it automatically * similar to user space **strtol**\ (3). * Return * Number of characters consumed on success. Must be positive but * no more than *buf_len*. * * **-EINVAL** if no valid digits were found or unsupported base * was provided. * * **-ERANGE** if resulting value was out of range. * * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the * given base and save the result in *res*. * * The string may begin with an arbitrary amount of white space * (as determined by **isspace**\ (3)). * * Five least significant bits of *flags* encode base, other bits * are currently unused. * * Base must be either 8, 10, 16 or 0 to detect it automatically * similar to user space **strtoul**\ (3). * Return * Number of characters consumed on success. Must be positive but * no more than *buf_len*. * * **-EINVAL** if no valid digits were found or unsupported base * was provided. * * **-ERANGE** if resulting value was out of range. * * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * * Logically, it could be thought of getting the value from * a *map* with *sk* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this * helper enforces the key must be a full socket and the map must * be a **BPF_MAP_TYPE_SK_STORAGE** also. * * Underneath, the value is stored locally at *sk* instead of * the *map*. The *map* is used as the bpf-local-storage * "type". 
The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * * *sk* is a kernel **struct sock** pointer for LSM program. * *sk* is a **struct bpf_sock** pointer for other program types. * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify * the initial value of a bpf-local-storage. If *value* is * **NULL**, the new bpf-local-storage will be zero initialized. * Return * A bpf-local-storage pointer is returned on success. * * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). * * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. * Return * 0 on success or successfully queued. * * **-EBUSY** if work queue under nmi is full. * * **-EINVAL** if *sig* is invalid. * * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. * * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header with options (at least * **sizeof**\ (**struct tcphdr**)). 
 * Return * On success, lower 32 bits hold the generated SYN cookie * followed by 16 bits which hold the MSS value for that cookie, * and the top 16 bits are unused. * * On failure, the returned value is one of the following: * * **-EINVAL** SYN cookie cannot be issued due to error * * **-ENOENT** SYN cookie should not be issued (no SYN flood) * * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf * event must have the following attributes: **PERF_SAMPLE_RAW** * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. * * The *flags* are used to indicate the index in *map* for which * the value must be put, masked with **BPF_F_INDEX_MASK**. * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** * to indicate that the index of the current CPU core should be * used. * * The value to write, of *size*, is passed through eBPF stack and * pointed by *data*. * * *ctx* is a pointer to in-kernel struct sk_buff. * * This helper is similar to **bpf_perf_event_output**\ () but * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. * * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. 
* * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the * terminating NUL byte. In case the string length is smaller than * *size*, the target is not padded with further NUL bytes. If the * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * * On success, returns the number of bytes that were written, * including the terminal NUL. This makes this helper useful in * tracing programs for reading strings, and more importantly to * get its length at runtime. See the following snippet: * * :: * * SEC("kprobe/sys_open") * void bpf_sys_open(struct pt_regs *ctx) * { * char buf[PATHLEN]; // PATHLEN is defined to 256 * int res = bpf_probe_read_user_str(buf, sizeof(buf), * ctx->di); * * // Consume buf, for example push it to * // userspace via bpf_perf_event_output(); we * // can use res (the string length) as event * // size, after checking its boundaries. * } * * In comparison, using **bpf_probe_read_user**\ () helper here * instead to read the string would require to estimate the length * at compile time, and would often result in copying more memory * than necessary. * * Another useful use case is when parsing individual process * arguments or individual environment variables navigating * *current*\ **->mm->arg_start** and *current*\ * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. 
* Return * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. * * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return * 0 on success or successfully queued. * * **-EBUSY** if work queue under nmi is full. * * **-EINVAL** if *sig* is invalid. * * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. * * u64 bpf_jiffies64(void) * Description * Obtain the 64bit jiffies * Return * The 64 bit jiffies * * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* * and store it in the buffer pointed by *buf* up to size * *size* bytes. * Return * On success, number of bytes written to *buf*. On error, a * negative value. * * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to * instead return the number of bytes required to store all the * branch entries. If this flag is set, *buf* may be NULL. * * **-EINVAL** if arguments invalid or **size** not a multiple * of **sizeof**\ (**struct perf_branch_entry**\ ). * * **-ENOENT** if architecture does not support branch records. * * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. 
 * Return * 0 on success, or one of the following in case of failure: * * **-EINVAL** if dev and inum supplied don't match dev_t and inode number * with nsfs of current task, or if dev conversion to dev_t lost high bits. * * **-ENOENT** if pidns does not exist for the current task. * * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf * event must have the following attributes: **PERF_SAMPLE_RAW** * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. * * The *flags* are used to indicate the index in *map* for which * the value must be put, masked with **BPF_F_INDEX_MASK**. * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** * to indicate that the index of the current CPU core should be * used. * * The value to write, of *size*, is passed through eBPF stack and * pointed by *data*. * * *ctx* is a pointer to in-kernel struct xdp_buff. * * This helper is similar to **bpf_perf_event_output**\ () but * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_get_netns_cookie(void *ctx) * Description * Retrieve the cookie (generated by the kernel) of the network * namespace the input *ctx* is associated with. The network * namespace cookie remains stable for its lifetime and provides * a global identifier that can be assumed unique. If *ctx* is * NULL, then the helper returns the cookie for the initial * network namespace. The cookie itself is very similar to that * of **bpf_get_socket_cookie**\ () helper, but for network * namespaces instead of sockets. * Return * An 8-byte long opaque number. 
* * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of the cgroup associated * with the current task at the *ancestor_level*. The root cgroup * is at *ancestor_level* zero and each step down the hierarchy * increments the level. If *ancestor_level* == level of cgroup * associated with the current task, then return value will be the * same as that of **bpf_get_current_cgroup_id**\ (). * * The helper is useful to implement policies based on cgroups * that are upper in hierarchy than immediate cgroup associated * with the current task. * * The format of returned id and helper limitations are same as in * **bpf_get_current_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. * * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This * description applies to **BPF_PROG_TYPE_SCHED_CLS** and * **BPF_PROG_TYPE_SCHED_ACT** programs. * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. * Subsequent redirection of *skb* via **bpf_redirect**\ (), * **bpf_clone_redirect**\ () or other methods outside of BPF may * interfere with successful delivery to the socket. * * This operation is only valid from TC ingress path. * * The *flags* argument must be zero. * Return * 0 on success, or a negative error in case of failure: * * **-EINVAL** if specified *flags* are not supported. * * **-ENOENT** if the socket is unavailable for assignment. * * **-ENETUNREACH** if the socket is unreachable (wrong netns). * * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). 
 * * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. * * Select the *sk* as a result of a socket lookup. * * For the operation to succeed passed socket must be compatible * with the packet description provided by the *ctx* object. * * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must * be an exact match. While IP family (**AF_INET** or * **AF_INET6**) must be compatible, that is IPv6 sockets * that are not v6-only can be selected for IPv4 packets. * * Only TCP listeners and UDP unconnected sockets can be * selected. *sk* can also be NULL to reset any previous * selection. * * *flags* argument can be a combination of the following values: * * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous * socket selection, potentially done by a BPF program * that ran before us. * * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip * load-balancing within reuseport group for the socket * being selected. * * On success *ctx->sk* will point to the selected socket. * * Return * 0 on success, or a negative errno in case of failure. * * * **-EAFNOSUPPORT** if socket family (*sk->family*) is * not compatible with packet family (*ctx->family*). * * * **-EEXIST** if socket has been already selected, * potentially by another program, and * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. * * * **-EINVAL** if unsupported flags were specified. * * * **-EPROTOTYPE** if socket L4 protocol * (*sk->protocol*) doesn't match packet protocol * (*ctx->protocol*). * * * **-ESOCKTNOSUPPORT** if socket is not in allowed * state (TCP listening or UDP unconnected). * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. * Does include the time the system was suspended. * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) * Return * Current *ktime*. 
* * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. * The *m* represents the seq_file. The *fmt* and *fmt_size* are for * the format string itself. The *data* and *data_len* are format string * arguments. The *data* are a **u64** array and corresponding format string * values are stored in the array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* array. * The *data_len* is the size of *data* in bytes - must be a multiple of 8. * * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. * Reading kernel memory may fail due to either invalid address or * valid address but requiring a major memory fault. If reading kernel memory * fails, the string for **%s** will be an empty string, and the ip * address for **%p{i,I}{4,6}** will be 0. Not returning error to * bpf program is consistent with what **bpf_trace_printk**\ () does for now. * Return * 0 on success, or a negative error in case of failure: * * **-EBUSY** if per-CPU memory copy buffer is busy, can try again * by returning 1 from bpf program. * * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. * * **-E2BIG** if *fmt* contains too many format specifiers. * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the * data to write in bytes. * Return * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. 
* * *sk* must be a non-**NULL** pointer to a socket, e.g. one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). * * This helper is available only if the kernel was compiled with * the **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. * * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. The root cgroup is at * *ancestor_level* zero and each step down the hierarchy * increments the level. If *ancestor_level* == level of cgroup * associated with *sk*, then return value will be same as that * of **bpf_sk_cgroup_id**\ (). * * The helper is useful to implement policies based on cgroups * that are upper in hierarchy than immediate cgroup associated * with *sk*. * * The format of returned id and helper limitations are same as in * **bpf_sk_cgroup_id**\ (). * Return * The id is returned or 0 in case the id could not be retrieved. * * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. * If **0** is specified in *flags*, an adaptive notification * of new data availability is sent. * * An adaptive notification is a notification sent whenever the user-space * process has caught up and consumed all available payloads. In case the user-space * process is still processing a previous payload, then no notification is needed * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. 
 * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. * * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. * If **0** is specified in *flags*, an adaptive notification * of new data availability is sent. * * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. * If **0** is specified in *flags*, an adaptive notification * of new data availability is sent. * * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) * Description * Query various characteristics of provided ring buffer. What * exactly is queried is determined by *flags*: * * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. * * **BPF_RB_RING_SIZE**: The size of ring buffer. * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). * * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. 
* Return * Requested value, or 0, if *flags* are not recognized. * * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform * checksum validation. The level is applicable to the following * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | * through **bpf_skb_adjust_room**\ () helper with passing in * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since * the UDP header is removed. Similarly, an encap of the latter * into the former could be accompanied by a helper call to * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the * skb is still intended to be processed in higher layers of the * stack instead of just egressing at tc. * * There are three supported level settings at this time: * * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs * with CHECKSUM_UNNECESSARY. * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs * with CHECKSUM_UNNECESSARY. * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and * sets CHECKSUM_NONE to force checksum validation by the stack. * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current * skb->csum_level. * Return * 0 on success, or a negative error in case of failure. In the * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. * * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. 
* * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set * the following flags: * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. * * **bpf_get_task_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject * to sufficient large buffer size. Note that * this limit can be controlled with the **sysctl** program, and * that it should be manually increased in order to profile long * user stacks (such as stacks for Java programs). To do so, use: * * :: * * # sysctl kernel.perf_event_max_stack= * Return * The non-negative copied *buf* length equal to or less than * *size* on success, or a negative error in case of failure. 
* * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. * * If the searching kind is an experimental kind * (i.e. 253 or 254 according to RFC6994). It also * needs to specify the "magic" which is either * 2 bytes or 4 bytes. It then also needs to * specify the size of the magic by using * the 2nd byte which is "kind-length" of a TCP * header option and the "kind-length" also * includes the first 2 bytes "kind" and "kind-length" * itself as a normal TCP header option also does. * * For example, to search experimental kind 254 with * 2 byte magic 0xeB9F, the searchby_res should be * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are * not supported. * * *len* must be at least 2 bytes which is the minimal size * of a header option. * * Supported flags: * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return * > 0 when found, the header option is copied to *searchby_res*. * The return value is the total length copied. On failure, a * negative error code is returned: * * **-EINVAL** if a parameter is invalid. * * **-ENOMSG** if the option is not found. * * **-ENOENT** if no syn packet is available when * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * * **-ENOSPC** if there is not enough space. Only *len* number of * bytes are copied. 
* * **-EFAULT** on failure to parse the header options in the * packet. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description * Store header option. The data will be copied * from buffer *from* with length *len* to the TCP header. * * The buffer *from* should have the whole option that * includes the kind, kind-length, and the actual * option data. The *len* must be at least kind-length * long. The kind-length does not have to be 4 byte * aligned. The kernel will take care of the padding * and setting the 4 bytes aligned value to th->doff. * * This helper will check for duplicated option * by searching the same option in the outgoing skb. * * This helper can only be called during * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * * **-EINVAL** If param is invalid. * * **-ENOSPC** if there is not enough space in the header. * Nothing has been written * * **-EEXIST** if the option already exists. * * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The * space will be used by **bpf_store_hdr_opt**\ () later in * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * * **-EINVAL** if a parameter is invalid. * * **-ENOSPC** if there is not enough space in the header. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. 
* * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. * * Logically, it could be thought of as getting the value from * a *map* with *inode* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this * helper enforces the key must be an inode and the map must also * be a **BPF_MAP_TYPE_INODE_STORAGE**. * * Underneath, the value is stored locally at *inode* instead of * the *map*. The *map* is used as the bpf-local-storage * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf_local_storage residing at *inode*. * * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be * used such that a new bpf_local_storage will be * created if one does not exist. *value* can be used * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify * the initial value of a bpf_local_storage. If *value* is * **NULL**, the new bpf_local_storage will be zero initialized. * Return * A bpf_local_storage pointer is returned on success. * * **NULL** if not found or there was an error in adding * a new bpf_local_storage. * * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) * Description * Delete a bpf_local_storage from an *inode*. * Return * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. 
* * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. * * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) * Description * Use BTF to store a string representation of *ptr*->ptr in *str*, * using *ptr*->type_id. This value should specify the type * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) * can be used to look up vmlinux BTF type ids. Traversing the * data structure using BTF, the type information and values are * stored in the first *str_size* - 1 bytes of *str*. Safe copy of * the pointer data is carried out to avoid kernel crashes during * operation. Smaller types can use string space on the stack; * larger programs can use map data to store the string * representation. * * The string can be subsequently shared with userspace via * bpf_perf_event_output() or ring buffer interfaces. * bpf_trace_printk() is to be avoided as it places too small * a limit on string size to be useful. * * *flags* is a combination of * * **BTF_F_COMPACT** * no formatting around type information * **BTF_F_NONAME** * no struct/union member names/types * **BTF_F_PTR_RAW** * show raw (unobfuscated) pointer values; * equivalent to printk specifier %px. * **BTF_F_ZERO** * show zero-valued struct/union members; they * are not displayed by default * * Return * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. * * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) * Description * Use BTF to write to seq_write a string representation of * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). 
* *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. * * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) * Description * See **bpf_get_cgroup_classid**\ () for the main description. * This helper differs from **bpf_get_cgroup_classid**\ () in that * the cgroup v1 net_cls class is retrieved only from the *skb*'s * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. * * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper * relies on the neighbor lookup for the L2 address of the nexthop. * * The helper will perform a FIB lookup based on the skb's * networking header to get the address of the next hop, unless * this is supplied by the caller in the *params* argument. The * *plen* argument indicates the len of *params* and should be set * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. * * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) * Description * Take a pointer to a percpu ksym, *percpu_ptr*, and return a * pointer to the percpu kernel variable on *cpu*. A ksym is an * extern variable decorated with '__ksym'. For ksym, there is a * global var (either static or global) defined of the same name * in the kernel. The ksym is percpu if the global var is percpu. * The returned pointer points to the global percpu var on *cpu*. 
* * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the * kernel, except that bpf_per_cpu_ptr() may return NULL. This * happens if *cpu* is larger than nr_cpu_ids. The caller of * bpf_per_cpu_ptr() must check the returned value. * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. * * void *bpf_this_cpu_ptr(const void *percpu_ptr) * Description * Take a pointer to a percpu ksym, *percpu_ptr*, and return a * pointer to the percpu kernel variable on this cpu. See the * description of 'ksym' in **bpf_per_cpu_ptr**\ (). * * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. * * long bpf_redirect_peer(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_redirect**\ (), except * that the redirection happens to the *ifindex*' peer device and * the netns switch takes place from ingress to ingress without * going through the CPU's backlog queue. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types at the ingress * hook and for veth device types. The peer device must reside in a * different network namespace. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. * * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) * Description * Get a bpf_local_storage from the *task*. * * Logically, it could be thought of as getting the value from * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. 
* * Underneath, the value is stored locally at *task* instead of * the *map*. The *map* is used as the bpf-local-storage * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf_local_storage residing at *task*. * * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be * used such that a new bpf_local_storage will be * created if one does not exist. *value* can be used * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify * the initial value of a bpf_local_storage. If *value* is * **NULL**, the new bpf_local_storage will be zero initialized. * Return * A bpf_local_storage pointer is returned on success. * * **NULL** if not found or there was an error in adding * a new bpf_local_storage. * * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) * Description * Delete a bpf_local_storage from a *task*. * Return * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. * * struct task_struct *bpf_get_current_task_btf(void) * Description * Return a BTF pointer to the "current" task. * This pointer can also be used in helpers that accept an * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. * * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) * Description * Set or clear certain options on *bprm*: * * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit * which sets the **AT_SECURE** auxv for glibc. The bit * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. * * u64 bpf_ktime_get_coarse_ns(void) * Description * Return a coarse-grained version of the time elapsed since * system boot, in nanoseconds. Does not include time the system * was suspended. * * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) * Return * Current *ktime*. * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description * Returns the stored IMA hash of the *inode* (if it's available). 
* If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. * * struct socket *bpf_sock_from_file(struct file *file) * Description * If the given file represents a socket, returns the associated * socket. * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * * The argument *len_diff* can be used for querying with a planned * size change. This allows checking MTU prior to changing packet * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in * principle not exceed the MTU, which is why it is not considered * a failure. Other BPF helpers are needed for performing the * planned size change; therefore the responsibility for catching * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't * used prior to redirect. * * On input *mtu_len* must be a valid pointer, else verifier will * reject BPF program. If the value *mtu_len* is initialized to * zero then the ctx packet size is used. When value *mtu_len* is * provided as input this specifies the L3 length that the MTU check * is done against. Remember XDP and TC length operate at L2, but * this value is L3 as this correlates to MTU and IP-header tot_len * values which are L3 (similar behavior as bpf_fib_lookup). * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. 
* For route level MTU checks use the **bpf_fib_lookup**\ () * helper. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** for tc cls_act programs. * * The *flags* argument can be a combination of one or more of the * following values: * * **BPF_MTU_CHK_SEGS** * This flag only works for *ctx* **struct sk_buff**. * If packet context contains extra packet segment buffers * (often known as GSO skb), then MTU check is harder to * check at this point, because in transmit path it is * possible for the skb packet to get re-segmented * (depending on net device features). This could still be * a MTU violation, so this flag enables performing MTU * check against segments, with a different violation * return code to tell it apart. Check cannot use len_diff. * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, * which is returned here and XDP and TC length operate at L2. * The helper takes this into account for you, but remember when using * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. * * * < 0 if any input argument is invalid (*mtu_len* not updated) * * MTU violations return positive values, but also populate MTU * value in *mtu_len* pointer, as this can be needed for * implementing PMTU handling: * * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) * Description * For each element in **map**, call **callback_fn** function with * **map**, **callback_ctx** and other map-specific parameters. * The **callback_fn** should be a static function and * the **callback_ctx** should be a pointer to the stack. * The **flags** is used to control certain aspects of the helper. * Currently, the **flags** must be 0. 
* * The following are a list of supported map types and their * respective expected callback signatures: * * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY * * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); * * For per_cpu maps, the map_value is the value on the cpu where the * bpf_prog is running. * * If **callback_fn** return 0, the helper will continue to the next * element. If return value is 1, the helper will skip the rest of * elements and return. Other return values are not used now. * * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. * * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) * Description * Outputs a string into the **str** buffer of size **str_size** * based on a format string stored in a read-only map pointed by * **fmt**. * * Each format specifier in **fmt** corresponds to one u64 element * in the **data** array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* * array. The *data_len* is the size of *data* in bytes - must be * a multiple of 8. * * Formats **%s** and **%p{i,I}{4,6}** require to read kernel * memory. Reading kernel memory may fail due to either invalid * address or valid address but requiring a major memory fault. If * reading kernel memory fails, the string for **%s** will be an * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. * Not returning error to bpf program is consistent with what * **bpf_trace_printk**\ () does for now. * * Return * The strictly positive length of the formatted string, including * the trailing zero character. If the return value is greater than * **str_size**, **str** contains a truncated string, guaranteed to * be zero-terminated except when **str_size** is 0. 
* * Or **-EBUSY** if the per-CPU memory copy buffer is busy. * * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) * Description * Execute bpf syscall with given arguments. * Return * A syscall result. * * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) * Description * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. * Return * Returns btf_id and btf_obj_fd in lower and upper 32 bits. * * long bpf_sys_close(u32 fd) * Description * Execute close syscall for given FD. * Return * A syscall result. * * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) * Description * Initialize the timer. * First 4 bits of *flags* specify clockid. * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. * All other bits of *flags* are reserved. * The verifier will reject the program if *timer* is not from * the same *map*. * Return * 0 on success. * **-EBUSY** if *timer* is already initialized. * **-EINVAL** if invalid *flags* are passed. * **-EPERM** if *timer* is in a map that doesn't have any user references. * The user space should either hold a file descriptor to a map with timers * or pin such map in bpffs. When map is unpinned or file descriptor is * closed all timers in the map will be cancelled and freed. * * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) * Description * Configure the timer to call *callback_fn* static function. * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EPERM** if *timer* is in a map that doesn't have any user references. * The user space should either hold a file descriptor to a map with timers * or pin such map in bpffs. When map is unpinned or file descriptor is * closed all timers in the map will be cancelled and freed. * * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) * Description * Set timer expiration N nanoseconds from the current time. 
The * configured callback will be invoked in soft irq context on some cpu * and will not repeat unless another bpf_timer_start() is made. * In such case the next invocation can migrate to a different cpu. * Since struct bpf_timer is a field inside map element the map * owns the timer. The bpf_timer_set_callback() will increment refcnt * of BPF program to make sure that callback_fn code stays valid. * When user space reference to a map reaches zero all timers * in a map are cancelled and corresponding program's refcnts are * decremented. This is done to make sure that Ctrl-C of a user * process doesn't leave any timers running. If map is pinned in * bpffs the callback_fn can re-arm itself indefinitely. * bpf_map_update/delete_elem() helpers and user space sys_bpf commands * cancel and free the timer in the given map element. * The map can contain timers that invoke callback_fn-s from different * programs. The same callback_fn can serve different timers from * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier * or invalid *flags* are passed. * * long bpf_timer_cancel(struct bpf_timer *timer) * Description * Cancel the timer and wait for callback_fn to finish if it was running. * Return * 0 if the timer was not active. * 1 if the timer was active. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. * * u64 bpf_get_func_ip(void *ctx) * Description * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. * 0 for kprobes placed within the function (not at the entry). 
* * u64 bpf_get_attach_cookie(void *ctx) * Description * Get bpf_cookie value provided (optionally) during the program * attachment. It might be different for each individual * attachment, even if BPF program itself is the same. * Expects BPF program context *ctx* as a first argument. * * Supported for the following program types: * - kprobe/uprobe; * - tracepoint; * - perf_event. * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. * * long bpf_task_pt_regs(struct task_struct *task) * Description * Get the struct pt_regs associated with **task**. * Return * A pointer to struct pt_regs. * * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) * Description * Get branch trace from hardware engines like Intel LBR. The * hardware engine is stopped shortly after the helper is * called. Therefore, the user need to filter branch entries * based on the actual use case. To capture branch trace * before the trigger point of the BPF program, the helper * should be called at the beginning of the BPF program. * * The data is stored as struct perf_branch_entry into output * buffer *entries*. *size* is the size of *entries* in bytes. * *flags* is reserved for now and must be zero. * * Return * On success, number of bytes written to *buf*. On error, a * negative value. * * **-EINVAL** if *flags* is not zero. * * **-ENOENT** if architecture does not support branch records. * * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 * to format and can handle more format args as a result. * * Arguments are to be used as in **bpf_seq_printf**\ () helper. * Return * The number of bytes written to the buffer, or a negative error * in case of failure. * * struct unix_sock *bpf_skc_to_unix_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *unix_sock* pointer. 
* Return * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) * Description * Get the address of a kernel symbol, returned in *res*. *res* is * set to 0 if the symbol is not found. * Return * On success, zero. On error, a negative value. * * **-EINVAL** if *flags* is not zero. * * **-EINVAL** if string *name* is not the same size as *name_sz*. * * **-ENOENT** if symbol is not found. * * **-EPERM** if caller does not have permission to obtain kernel address. * * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) * Description * Find vma of *task* that contains *addr*, call *callback_fn* * function with *task*, *vma*, and *callback_ctx*. * The *callback_fn* should be a static function and * the *callback_ctx* should be a pointer to the stack. * The *flags* is used to control certain aspects of the helper. * Currently, the *flags* must be 0. * * The expected callback signature is * * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); * * Return * 0 on success. * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. * * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) * Description * For **nr_loops**, call **callback_fn** function * with **callback_ctx** as the context parameter. * The **callback_fn** should be a static function and * the **callback_ctx** should be a pointer to the stack. * The **flags** is used to control certain aspects of the helper. * Currently, the **flags** must be 0. Currently, nr_loops is * limited to 1 << 23 (~8 million) loops. * * long (\*callback_fn)(u32 index, void \*ctx); * * where **index** is the current index in the loop. The index * is zero-indexed. * * If **callback_fn** returns 0, the helper will continue to the next * loop. 
If return value is 1, the helper will skip the rest of * the loops and return. Other return values are not used now, * and will be rejected by the verifier. * * Return * The number of loops performed, **-EINVAL** for invalid **flags**, * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. * * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) * Description * Do strncmp() between **s1** and **s2**. **s1** doesn't need * to be null-terminated and **s1_sz** is the maximum storage * size of **s1**. **s2** must be a read-only string. * Return * An integer less than, equal to, or greater than zero * if the first **s1_sz** bytes of **s1** is found to be * less than, to match, or be greater than **s2**. * * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) * Description * Get **n**-th argument register (zero based) of the traced function (for tracing programs) * returned in **value**. * * Return * 0 on success. * **-EINVAL** if n >= argument register count of traced function. * * long bpf_get_func_ret(void *ctx, u64 *value) * Description * Get return value of the traced function (for tracing programs) * in **value**. * * Return * 0 on success. * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. * * long bpf_get_func_arg_cnt(void *ctx) * Description * Get number of registers of the traced function (for tracing programs) where * function arguments are stored in these registers. * * Return * The number of argument registers of the traced function. * * int bpf_get_retval(void) * Description * Get the BPF program's return value that will be returned to the upper layers. * * This helper is currently supported by cgroup programs and only by the hooks * where BPF program's return value is returned to the userspace via errno. * Return * The BPF program's return value. * * int bpf_set_retval(int retval) * Description * Set the BPF program's return value that will be returned to the upper layers. 
* * This helper is currently supported by cgroup programs and only by the hooks * where BPF program's return value is returned to the userspace via errno. * * Note that there is the following corner case where the program exports an error * via bpf_set_retval but signals success via 'return 1': * * bpf_set_retval(-EPERM); * return 1; * * In this case, the BPF program's return value will use helper's -EPERM. This * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * * Return * 0 on success, or a negative error in case of failure. * * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) * Description * Get the total size of a given xdp buff (linear and paged area) * Return * The total size of a given xdp buffer. * * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) * Description * This helper is provided as an easy way to load data from a * xdp buffer. It can be used to load *len* bytes from *offset* from * the frame associated to *xdp_md*, into the buffer pointed by * *buf*. * Return * 0 on success, or a negative error in case of failure. * * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) * Description * Store *len* bytes from buffer *buf* into the frame * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. * * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) * Description * Read *size* bytes from user space address *user_ptr* in *tsk*'s * address space, and stores the data in *dst*. *flags* is not * used yet and is provided for future extensibility. This helper * can only be used by sleepable programs. * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. 
* * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description * Change the __sk_buff->tstamp_type to *tstamp_type* * and set *tstamp* to the __sk_buff->tstamp together. * * If there is no need to change the __sk_buff->tstamp_type, * the tstamp value can be directly written to __sk_buff->tstamp * instead. * * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that * will be kept during bpf_redirect_*(). A non zero * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO * *tstamp_type*. * * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * * This function is most useful when it needs to set a * mono delivery time to __sk_buff->tstamp and then * bpf_redirect_*() to the egress of an iface. For example, * changing the (rcv) timestamp in __sk_buff->tstamp at * ingress to a mono delivery time and then bpf_redirect_*() * to sch_fq@phy-dev. * Return * 0 on success. * **-EINVAL** for invalid input * **-EOPNOTSUPP** for unsupported protocol * * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) * Description * Returns a calculated IMA hash of the *file*. * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return * The **hash_algo** is returned on success, * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * * void *bpf_kptr_xchg(void *map_value, void *ptr) * Description * Exchange kptr at pointer *map_value* with *ptr*, and return the * old value. *ptr* can be NULL, otherwise it must be a referenced * pointer which will be released when this helper is called. * Return * The old value of kptr (which can be NULL). The returned pointer * if not NULL, is a reference which must be released using its * corresponding release function, or moved into a BPF map before * program exit. 
* * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) * Description * Perform a lookup in *percpu map* for an entry associated to * *key* on *cpu*. * Return * Map value associated to *key* on *cpu*, or **NULL** if no entry * was found or *cpu* is invalid. * * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * * *data* must be a ptr to a map value. * The maximum *size* supported is DYNPTR_MAX_SIZE. * *flags* is currently unused. * Return * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, * -EINVAL if flags is not 0. * * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf* * through the dynptr interface. *flags* must be 0. * * Please note that a corresponding bpf_ringbuf_submit_dynptr or * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the * reservation fails. This is enforced by the verifier. * Return * 0 on success, or a negative error in case of failure. * * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*, * through the dynptr interface. This is a no-op if the dynptr is * invalid/null. * * For more information on *flags*, please see * 'bpf_ringbuf_submit'. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) * Description * Discard reserved ring buffer sample through the dynptr * interface. This is a no-op if the dynptr is invalid/null. * * For more information on *flags*, please see * 'bpf_ringbuf_discard'. * Return * Nothing. Always succeeds. 
* * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* * is a read-only dynptr or if *flags* is not 0. * * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description * Get a pointer to the underlying dynptr data. * * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. * * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IPv4/TCP headers, *iph* and *th*, without depending on a * listening socket. * * *iph* points to the IPv4 header. * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header (at least * **sizeof**\ (**struct tcphdr**)). * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, * and the top 16 bits are unused. * * On failure, the returned value is one of the following: * * **-EINVAL** if *th_len* is invalid. 
* * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IPv6/TCP headers, *iph* and *th*, without depending on a * listening socket. * * *iph* points to the IPv6 header. * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header (at least * **sizeof**\ (**struct tcphdr**)). * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, * and the top 16 bits are unused. * * On failure, the returned value is one of the following: * * **-EINVAL** if *th_len* is invalid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. * * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK * without depending on a listening socket. * * *iph* points to the IPv4 header. * * *th* points to the TCP header. * Return * 0 if *iph* and *th* are a valid SYN cookie ACK. * * On failure, the returned value is one of the following: * * **-EACCES** if the SYN cookie is not valid. * * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK * without depending on a listening socket. * * *iph* points to the IPv6 header. * * *th* points to the TCP header. * Return * 0 if *iph* and *th* are a valid SYN cookie ACK. * * On failure, the returned value is one of the following: * * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. * * u64 bpf_ktime_get_tai_ns(void) * Description * A nonsettable system-wide clock derived from wall-clock time but * ignoring leap seconds. This clock does not experience * discontinuities and backwards jumps caused by NTP inserting leap * seconds as CLOCK_REALTIME does. 
* * See: **clock_gettime**\ (**CLOCK_TAI**) * Return * Current *ktime*. * * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) * Description * Drain samples from the specified user ring buffer, and invoke * the provided callback for each such sample: * * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); * * If **callback_fn** returns 0, the helper will continue to try * and drain the next sample, up to a maximum of * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, * the helper will skip the rest of the samples and return. Other * return values are not used now, and will be rejected by the * verifier. * Return * The number of drained samples if no error was encountered while * draining samples, or 0 if no samples were present in the ring * buffer. If a user-space producer was epoll-waiting on this map, * and at least one sample was drained, they will receive an event * notification notifying them of available space in the ring * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this * function, no wakeup notification will be sent. If the * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will * be sent even if no sample was drained. * * On failure, the returned value is one of the following: * * **-EBUSY** if the ring buffer is contended, and another calling * context was concurrently draining the ring buffer. * * **-EINVAL** if user-space is not properly tracking the ring * buffer due to the producer position not being aligned to 8 * bytes, a sample not being aligned to 8 bytes, or the producer * position not matching the advertised length of a sample. * * **-E2BIG** if user-space has tried to publish a sample which is * larger than the size of the ring buffer, or which cannot fit * within a struct bpf_dynptr. * * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) * Description * Get a bpf_local_storage from the *cgroup*. 
* * Logically, it could be thought of as getting the value from * a *map* with *cgroup* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this * helper enforces the key must be a cgroup struct and the map must also * be a **BPF_MAP_TYPE_CGRP_STORAGE**. * * In reality, the local-storage value is embedded directly inside of the * *cgroup* object itself, rather than being located in the * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is * queried for some *map* on a *cgroup* object, the kernel will perform an * O(n) iteration over all of the live local-storage values for that * *cgroup* object until the local-storage value for the *map* is found. * * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be * used such that a new bpf_local_storage will be * created if one does not exist. *value* can be used * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify * the initial value of a bpf_local_storage. If *value* is * **NULL**, the new bpf_local_storage will be zero initialized. * Return * A bpf_local_storage pointer is returned on success. * * **NULL** if not found or there was an error in adding * a new bpf_local_storage. * * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) * Description * Delete a bpf_local_storage from a *cgroup*. * Return * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) 
\ FN(unspec, 0, ##ctx) \ FN(map_lookup_elem, 1, ##ctx) \ FN(map_update_elem, 2, ##ctx) \ FN(map_delete_elem, 3, ##ctx) \ FN(probe_read, 4, ##ctx) \ FN(ktime_get_ns, 5, ##ctx) \ FN(trace_printk, 6, ##ctx) \ FN(get_prandom_u32, 7, ##ctx) \ FN(get_smp_processor_id, 8, ##ctx) \ FN(skb_store_bytes, 9, ##ctx) \ FN(l3_csum_replace, 10, ##ctx) \ FN(l4_csum_replace, 11, ##ctx) \ FN(tail_call, 12, ##ctx) \ FN(clone_redirect, 13, ##ctx) \ FN(get_current_pid_tgid, 14, ##ctx) \ FN(get_current_uid_gid, 15, ##ctx) \ FN(get_current_comm, 16, ##ctx) \ FN(get_cgroup_classid, 17, ##ctx) \ FN(skb_vlan_push, 18, ##ctx) \ FN(skb_vlan_pop, 19, ##ctx) \ FN(skb_get_tunnel_key, 20, ##ctx) \ FN(skb_set_tunnel_key, 21, ##ctx) \ FN(perf_event_read, 22, ##ctx) \ FN(redirect, 23, ##ctx) \ FN(get_route_realm, 24, ##ctx) \ FN(perf_event_output, 25, ##ctx) \ FN(skb_load_bytes, 26, ##ctx) \ FN(get_stackid, 27, ##ctx) \ FN(csum_diff, 28, ##ctx) \ FN(skb_get_tunnel_opt, 29, ##ctx) \ FN(skb_set_tunnel_opt, 30, ##ctx) \ FN(skb_change_proto, 31, ##ctx) \ FN(skb_change_type, 32, ##ctx) \ FN(skb_under_cgroup, 33, ##ctx) \ FN(get_hash_recalc, 34, ##ctx) \ FN(get_current_task, 35, ##ctx) \ FN(probe_write_user, 36, ##ctx) \ FN(current_task_under_cgroup, 37, ##ctx) \ FN(skb_change_tail, 38, ##ctx) \ FN(skb_pull_data, 39, ##ctx) \ FN(csum_update, 40, ##ctx) \ FN(set_hash_invalid, 41, ##ctx) \ FN(get_numa_node_id, 42, ##ctx) \ FN(skb_change_head, 43, ##ctx) \ FN(xdp_adjust_head, 44, ##ctx) \ FN(probe_read_str, 45, ##ctx) \ FN(get_socket_cookie, 46, ##ctx) \ FN(get_socket_uid, 47, ##ctx) \ FN(set_hash, 48, ##ctx) \ FN(setsockopt, 49, ##ctx) \ FN(skb_adjust_room, 50, ##ctx) \ FN(redirect_map, 51, ##ctx) \ FN(sk_redirect_map, 52, ##ctx) \ FN(sock_map_update, 53, ##ctx) \ FN(xdp_adjust_meta, 54, ##ctx) \ FN(perf_event_read_value, 55, ##ctx) \ FN(perf_prog_read_value, 56, ##ctx) \ FN(getsockopt, 57, ##ctx) \ FN(override_return, 58, ##ctx) \ FN(sock_ops_cb_flags_set, 59, ##ctx) \ FN(msg_redirect_map, 60, ##ctx) \ 
FN(msg_apply_bytes, 61, ##ctx) \ FN(msg_cork_bytes, 62, ##ctx) \ FN(msg_pull_data, 63, ##ctx) \ FN(bind, 64, ##ctx) \ FN(xdp_adjust_tail, 65, ##ctx) \ FN(skb_get_xfrm_state, 66, ##ctx) \ FN(get_stack, 67, ##ctx) \ FN(skb_load_bytes_relative, 68, ##ctx) \ FN(fib_lookup, 69, ##ctx) \ FN(sock_hash_update, 70, ##ctx) \ FN(msg_redirect_hash, 71, ##ctx) \ FN(sk_redirect_hash, 72, ##ctx) \ FN(lwt_push_encap, 73, ##ctx) \ FN(lwt_seg6_store_bytes, 74, ##ctx) \ FN(lwt_seg6_adjust_srh, 75, ##ctx) \ FN(lwt_seg6_action, 76, ##ctx) \ FN(rc_repeat, 77, ##ctx) \ FN(rc_keydown, 78, ##ctx) \ FN(skb_cgroup_id, 79, ##ctx) \ FN(get_current_cgroup_id, 80, ##ctx) \ FN(get_local_storage, 81, ##ctx) \ FN(sk_select_reuseport, 82, ##ctx) \ FN(skb_ancestor_cgroup_id, 83, ##ctx) \ FN(sk_lookup_tcp, 84, ##ctx) \ FN(sk_lookup_udp, 85, ##ctx) \ FN(sk_release, 86, ##ctx) \ FN(map_push_elem, 87, ##ctx) \ FN(map_pop_elem, 88, ##ctx) \ FN(map_peek_elem, 89, ##ctx) \ FN(msg_push_data, 90, ##ctx) \ FN(msg_pop_data, 91, ##ctx) \ FN(rc_pointer_rel, 92, ##ctx) \ FN(spin_lock, 93, ##ctx) \ FN(spin_unlock, 94, ##ctx) \ FN(sk_fullsock, 95, ##ctx) \ FN(tcp_sock, 96, ##ctx) \ FN(skb_ecn_set_ce, 97, ##ctx) \ FN(get_listener_sock, 98, ##ctx) \ FN(skc_lookup_tcp, 99, ##ctx) \ FN(tcp_check_syncookie, 100, ##ctx) \ FN(sysctl_get_name, 101, ##ctx) \ FN(sysctl_get_current_value, 102, ##ctx) \ FN(sysctl_get_new_value, 103, ##ctx) \ FN(sysctl_set_new_value, 104, ##ctx) \ FN(strtol, 105, ##ctx) \ FN(strtoul, 106, ##ctx) \ FN(sk_storage_get, 107, ##ctx) \ FN(sk_storage_delete, 108, ##ctx) \ FN(send_signal, 109, ##ctx) \ FN(tcp_gen_syncookie, 110, ##ctx) \ FN(skb_output, 111, ##ctx) \ FN(probe_read_user, 112, ##ctx) \ FN(probe_read_kernel, 113, ##ctx) \ FN(probe_read_user_str, 114, ##ctx) \ FN(probe_read_kernel_str, 115, ##ctx) \ FN(tcp_send_ack, 116, ##ctx) \ FN(send_signal_thread, 117, ##ctx) \ FN(jiffies64, 118, ##ctx) \ FN(read_branch_records, 119, ##ctx) \ FN(get_ns_current_pid_tgid, 120, ##ctx) \ FN(xdp_output, 121, 
##ctx) \ FN(get_netns_cookie, 122, ##ctx) \ FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ FN(sk_assign, 124, ##ctx) \ FN(ktime_get_boot_ns, 125, ##ctx) \ FN(seq_printf, 126, ##ctx) \ FN(seq_write, 127, ##ctx) \ FN(sk_cgroup_id, 128, ##ctx) \ FN(sk_ancestor_cgroup_id, 129, ##ctx) \ FN(ringbuf_output, 130, ##ctx) \ FN(ringbuf_reserve, 131, ##ctx) \ FN(ringbuf_submit, 132, ##ctx) \ FN(ringbuf_discard, 133, ##ctx) \ FN(ringbuf_query, 134, ##ctx) \ FN(csum_level, 135, ##ctx) \ FN(skc_to_tcp6_sock, 136, ##ctx) \ FN(skc_to_tcp_sock, 137, ##ctx) \ FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ FN(skc_to_tcp_request_sock, 139, ##ctx) \ FN(skc_to_udp6_sock, 140, ##ctx) \ FN(get_task_stack, 141, ##ctx) \ FN(load_hdr_opt, 142, ##ctx) \ FN(store_hdr_opt, 143, ##ctx) \ FN(reserve_hdr_opt, 144, ##ctx) \ FN(inode_storage_get, 145, ##ctx) \ FN(inode_storage_delete, 146, ##ctx) \ FN(d_path, 147, ##ctx) \ FN(copy_from_user, 148, ##ctx) \ FN(snprintf_btf, 149, ##ctx) \ FN(seq_printf_btf, 150, ##ctx) \ FN(skb_cgroup_classid, 151, ##ctx) \ FN(redirect_neigh, 152, ##ctx) \ FN(per_cpu_ptr, 153, ##ctx) \ FN(this_cpu_ptr, 154, ##ctx) \ FN(redirect_peer, 155, ##ctx) \ FN(task_storage_get, 156, ##ctx) \ FN(task_storage_delete, 157, ##ctx) \ FN(get_current_task_btf, 158, ##ctx) \ FN(bprm_opts_set, 159, ##ctx) \ FN(ktime_get_coarse_ns, 160, ##ctx) \ FN(ima_inode_hash, 161, ##ctx) \ FN(sock_from_file, 162, ##ctx) \ FN(check_mtu, 163, ##ctx) \ FN(for_each_map_elem, 164, ##ctx) \ FN(snprintf, 165, ##ctx) \ FN(sys_bpf, 166, ##ctx) \ FN(btf_find_by_name_kind, 167, ##ctx) \ FN(sys_close, 168, ##ctx) \ FN(timer_init, 169, ##ctx) \ FN(timer_set_callback, 170, ##ctx) \ FN(timer_start, 171, ##ctx) \ FN(timer_cancel, 172, ##ctx) \ FN(get_func_ip, 173, ##ctx) \ FN(get_attach_cookie, 174, ##ctx) \ FN(task_pt_regs, 175, ##ctx) \ FN(get_branch_snapshot, 176, ##ctx) \ FN(trace_vprintk, 177, ##ctx) \ FN(skc_to_unix_sock, 178, ##ctx) \ FN(kallsyms_lookup_name, 179, ##ctx) \ FN(find_vma, 180, ##ctx) \ FN(loop, 
181, ##ctx) \ FN(strncmp, 182, ##ctx) \ FN(get_func_arg, 183, ##ctx) \ FN(get_func_ret, 184, ##ctx) \ FN(get_func_arg_cnt, 185, ##ctx) \ FN(get_retval, 186, ##ctx) \ FN(set_retval, 187, ##ctx) \ FN(xdp_get_buff_len, 188, ##ctx) \ FN(xdp_load_bytes, 189, ##ctx) \ FN(xdp_store_bytes, 190, ##ctx) \ FN(copy_from_user_task, 191, ##ctx) \ FN(skb_set_tstamp, 192, ##ctx) \ FN(ima_file_hash, 193, ##ctx) \ FN(kptr_xchg, 194, ##ctx) \ FN(map_lookup_percpu_elem, 195, ##ctx) \ FN(skc_to_mptcp_sock, 196, ##ctx) \ FN(dynptr_from_mem, 197, ##ctx) \ FN(ringbuf_reserve_dynptr, 198, ##ctx) \ FN(ringbuf_submit_dynptr, 199, ##ctx) \ FN(ringbuf_discard_dynptr, 200, ##ctx) \ FN(dynptr_read, 201, ##ctx) \ FN(dynptr_write, 202, ##ctx) \ FN(dynptr_data, 203, ##ctx) \ FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ FN(ktime_get_tai_ns, 208, ##ctx) \ FN(user_ringbuf_drain, 209, ##ctx) \ FN(cgrp_storage_get, 210, ##ctx) \ FN(cgrp_storage_delete, 211, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't * know or care about integer value that is now passed as second argument */ #define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), #define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ #define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, enum bpf_func_id { ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; #undef __BPF_ENUM_FN /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ enum { BPF_F_RECOMPUTE_CSUM = (1ULL << 0), BPF_F_INVALIDATE_HASH = (1ULL << 1), }; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. 
*/ enum { BPF_F_HDR_FIELD_MASK = 0xfULL, }; /* BPF_FUNC_l4_csum_replace flags. */ enum { BPF_F_PSEUDO_HDR = (1ULL << 4), BPF_F_MARK_MANGLED_0 = (1ULL << 5), BPF_F_MARK_ENFORCE = (1ULL << 6), }; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ enum { BPF_F_INGRESS = (1ULL << 0), }; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ enum { BPF_F_TUNINFO_IPV6 = (1ULL << 0), }; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ enum { BPF_F_SKIP_FIELD_MASK = 0xffULL, BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ BPF_F_FAST_STACK_CMP = (1ULL << 9), BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ BPF_F_USER_BUILD_ID = (1ULL << 11), }; /* BPF_FUNC_skb_set_tunnel_key flags. */ enum { BPF_F_ZERO_CSUM_TX = (1ULL << 1), BPF_F_DONT_FRAGMENT = (1ULL << 2), BPF_F_SEQ_NUMBER = (1ULL << 3), BPF_F_NO_TUNNEL_KEY = (1ULL << 4), }; /* BPF_FUNC_skb_get_tunnel_key flags. */ enum { BPF_F_TUNINFO_FLAGS = (1ULL << 4), }; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ BPF_F_CTXLEN_MASK = (0xfffffULL << 32), }; /* Current network namespace */ enum { BPF_F_CURRENT_NETNS = (-1L), }; /* BPF_FUNC_csum_level level values. */ enum { BPF_CSUM_LEVEL_QUERY, BPF_CSUM_LEVEL_INC, BPF_CSUM_LEVEL_DEC, BPF_CSUM_LEVEL_RESET, }; /* BPF_FUNC_skb_adjust_room flags. 
*/ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, }; #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; /* BPF_FUNC__storage_get flags */ enum { BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. */ BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ enum { BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), }; /* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and * BPF_FUNC_bpf_ringbuf_output flags. */ enum { BPF_RB_NO_WAKEUP = (1ULL << 0), BPF_RB_FORCE_WAKEUP = (1ULL << 1), }; /* BPF_FUNC_bpf_ringbuf_query flags */ enum { BPF_RB_AVAIL_DATA = 0, BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, }; /* BPF ring buffer constants */ enum { BPF_RINGBUF_BUSY_BIT = (1U << 31), BPF_RINGBUF_DISCARD_BIT = (1U << 30), BPF_RINGBUF_HDR_SZ = 8, }; /* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ enum { BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), }; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ enum bpf_hdr_start_off { BPF_HDR_START_MAC, BPF_HDR_START_NET, }; /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. 
*/ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, BPF_LWT_ENCAP_SEG6_INLINE, BPF_LWT_ENCAP_IP, }; /* Flags for bpf_bprm_opts_set helper */ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; /* Flags for bpf_redirect_map helper */ enum { BPF_F_BROADCAST = (1ULL << 3), BPF_F_EXCLUDE_INGRESS = (1ULL << 4), }; #define __bpf_md_ptr(type, name) \ union { \ type name; \ __u64 :64; \ } __attribute__((aligned(8))) enum { BPF_SKB_TSTAMP_UNSPEC, BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC * and try to deduce it by ingress, egress or skb->sk->sk_clockid. */ }; /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; __u32 protocol; __u32 vlan_present; __u32 vlan_tci; __u32 vlan_proto; __u32 priority; __u32 ingress_ifindex; __u32 ifindex; __u32 tc_index; __u32 cb[5]; __u32 hash; __u32 tc_classid; __u32 data; __u32 data_end; __u32 napi_id; /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ __u32 remote_ip6[4]; /* Stored in network byte order */ __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ /* ... here. */ __u32 data_meta; __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; __u8 tstamp_type; __u32 :24; /* Padding, future use. 
*/ __u64 hwtstamp; }; struct bpf_tunnel_key { __u32 tunnel_id; union { __u32 remote_ipv4; __u32 remote_ipv6[4]; }; __u8 tunnel_tos; __u8 tunnel_ttl; union { __u16 tunnel_ext; /* compat */ __be16 tunnel_flags; }; __u32 tunnel_label; union { __u32 local_ipv4; __u32 local_ipv6[4]; }; }; /* user accessible mirror of in-kernel xfrm_state. * new fields can only be added to the end of this structure */ struct bpf_xfrm_state { __u32 reqid; __u32 spi; /* Stored in network byte order */ __u16 family; __u16 ext; /* Padding, future use. */ union { __u32 remote_ipv4; /* Stored in network byte order */ __u32 remote_ipv6[4]; /* Stored in network byte order */ }; }; /* Generic BPF return codes which all BPF program types may support. * The values are binary compatible with their TC_ACT_* counter-part to * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT * programs. * * XDP is handled seprately, see XDP_*. */ enum bpf_ret_code { BPF_OK = 0, /* 1 reserved */ BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, /* >127 are reserved for prog type specific return codes. * * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been * changed and should be routed based on its new L3 header. * (This is an L3 redirect, as opposed to L2 redirect * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR * to indicate that no custom dissection was performed, and * fallback to standard dissector is requested. 
*/ BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { __u32 bound_dev_if; __u32 family; __u32 type; __u32 protocol; __u32 mark; __u32 priority; /* IP address also allows 1 and 2 bytes access */ __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ __be16 dst_port; /* network byte order */ __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; __s32 rx_queue_mapping; }; struct bpf_tcp_sock { __u32 snd_cwnd; /* Sending congestion window */ __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ __u32 rtt_min; __u32 snd_ssthresh; /* Slow start size threshold */ __u32 rcv_nxt; /* What we want to receive next */ __u32 snd_nxt; /* Next sequence we send */ __u32 snd_una; /* First byte we want an ack for */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u32 ecn_flags; /* ECN status bits. */ __u32 rate_delivered; /* saved rate sample: packets delivered */ __u32 rate_interval_us; /* saved rate sample: time elapsed */ __u32 packets_out; /* Packets which are "in flight" */ __u32 retrans_out; /* Retransmitted packets out */ __u32 total_retrans; /* Total retransmits for entire connection */ __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ __u32 delivered; /* Total data packets delivered incl. 
rexmits */ __u32 delivered_ce; /* Like the above but only ECE marked packets */ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ }; struct bpf_sock_tuple { union { struct { __be32 saddr; __be32 daddr; __be16 sport; __be16 dport; } ipv4; struct { __be32 saddr[4]; __be32 daddr[4]; __be16 sport; __be16 dport; } ipv6; }; }; struct bpf_xdp_sock { __u32 queue_id; }; #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook * new fields must be added to the end of this structure */ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ __u32 egress_ifindex; /* txq->dev->ifindex */ }; /* DEVMAP map-value layout * * The struct data-layout of map-value is a configuration interface. * New members can only be added to the end of this structure. */ struct bpf_devmap_val { __u32 ifindex; /* device index */ union { int fd; /* prog fd on map write */ __u32 id; /* prog id on map read */ } bpf_prog; }; /* CPUMAP map-value layout * * The struct data-layout of map-value is a configuration interface. * New members can only be added to the end of this structure. 
*/ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ union { int fd; /* prog fd on map write */ __u32 id; /* prog id on map read */ } bpf_prog; }; enum sk_action { SK_DROP = 0, SK_PASS, }; /* user accessible metadata for SK_MSG packet hook, new fields must * be added to the end of this structure */ struct sk_msg_md { __bpf_md_ptr(void *, data); __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ __u32 remote_ip6[4]; /* Stored in network byte order */ __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { /* * Start of directly accessible data. It begins from * the tcp/udp header. */ __bpf_md_ptr(void *, data); /* End of directly accessible data */ __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) * could be less than this "len". Those bytes could be * indirectly read by a helper "bpf_skb_load_bytes()". */ __u32 len; /* * Eth protocol in the mac header (network byte order). e.g. * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) */ __u32 eth_protocol; __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ __u32 bind_inany; /* Is sock bound to an INANY address? */ __u32 hash; /* A hash of the packet 4 tuples */ /* When reuse->migrating_sk is NULL, it is selecting a sk for the * new incoming connection request (e.g. selecting a listen sk for * the received SYN in the TCP case). reuse->sk is one of the sk * in the reuseport group. The bpf prog can use reuse->sk to learn * the local listening ip/port without looking into the skb. 
* * When reuse->migrating_sk is not NULL, reuse->sk is closed and * reuse->migrating_sk is the socket that needs to be migrated * to another listening socket. migrating_sk could be a fullsock * sk that is fully established or a reqsk that is in-the-middle * of 3-way handshake. */ __bpf_md_ptr(struct bpf_sock *, sk); __bpf_md_ptr(struct bpf_sock *, migrating_sk); }; #define BPF_TAG_SIZE 8 struct bpf_prog_info { __u32 type; __u32 id; __u8 tag[BPF_TAG_SIZE]; __u32 jited_prog_len; __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; __u64 load_time; /* ns since boottime */ __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; __aligned_u64 jited_func_lens; __u32 btf_id; __u32 func_info_rec_size; __aligned_u64 func_info; __u32 nr_func_info; __u32 nr_line_info; __aligned_u64 line_info; __aligned_u64 jited_line_info; __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; __u64 recursion_misses; __u32 verified_insns; __u32 attach_btf_obj_id; __u32 attach_btf_id; } __attribute__((aligned(8))); struct bpf_map_info { __u32 type; __u32 id; __u32 key_size; __u32 value_size; __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 btf_vmlinux_value_type_id; __u64 netns_dev; __u64 netns_ino; __u32 btf_id; __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 :32; /* alignment pad */ __u64 map_extra; } __attribute__((aligned(8))); struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; __aligned_u64 name; __u32 name_len; __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { __u32 type; __u32 id; __u32 prog_id; union { struct { __aligned_u64 tp_name; /* in/out: tp_name 
buffer ptr */ __u32 tp_name_len; /* in/out: tp_name buffer len */ } raw_tracepoint; struct { __u32 attach_type; __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; __u32 attach_type; } cgroup; struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ /* If the iter specific field is 32 bits, it can be put * in the first or second union. Otherwise it should be * put in the second union. */ union { struct { __u32 map_id; } map; }; union { struct { __u64 cgroup_id; __u32 order; } cgroup; struct { __u32 tid; __u32 pid; } task; }; } iter; struct { __u32 netns_ino; __u32 attach_type; } netns; struct { __u32 ifindex; } xdp; }; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); }; /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. 
* Some of this fields are in network (bigendian) byte order and may need * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). * New fields can only be added at the end of this structure */ struct bpf_sock_ops { __u32 op; union { __u32 args[4]; /* Optionally passed to bpf program */ __u32 reply; /* Returned by bpf program */ __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ __u32 remote_ip6[4]; /* Stored in network byte order */ __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 is_fullsock; /* Some TCP fields are only valid if * there is a full socket. If not, the * fields read as zero. */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ __u32 state; __u32 rtt_min; __u32 snd_ssthresh; __u32 rcv_nxt; __u32 snd_nxt; __u32 snd_una; __u32 mss_cache; __u32 ecn_flags; __u32 rate_delivered; __u32 rate_interval_us; __u32 packets_out; __u32 retrans_out; __u32 total_retrans; __u32 segs_in; __u32 data_segs_in; __u32 segs_out; __u32 data_segs_out; __u32 lost_out; __u32 sacked_out; __u32 sk_txhash; __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); /* [skb_data, skb_data_end) covers the whole TCP header. * * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the * header has not been written. * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have * been written so far. * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes * the 3WHS. * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes * the 3WHS. * * bpf_load_hdr_opt() can also be used to read a particular option. 
*/ __bpf_md_ptr(void *, skb_data); __bpf_md_ptr(void *, skb_data_end); __u32 skb_len; /* The total length of a packet. * It includes the header, options, * and payload. */ __u32 skb_tcp_flags; /* tcp_flags of the header. It provides * an easy way to check for tcp_flags * without parsing skb_data. * * In particular, the skb_tcp_flags * will still be available in * BPF_SOCK_OPS_HDR_OPT_LEN even though * the outgoing header has not * been written yet. */ __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ enum { BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), /* Call bpf for all received TCP headers. The bpf prog will be * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB * * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB * for the header option related helpers that will be useful * to the bpf programs. * * It could be used at the client/active side (i.e. connect() side) * when the server told it that the server was in syncookie * mode and required the active side to resend the bpf-written * options. The active side can keep writing the bpf-options until * it received a valid packet from the server side to confirm * the earlier packet (and options) has been received. The later * example patch is using it like this at the active side when the * server is in syncookie mode. * * The bpf prog will usually turn this off in the common cases. */ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), /* Call bpf when kernel has received a header option that * the kernel cannot handle. The bpf prog will be called under * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. * * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB * for the header option related helpers that will be useful * to the bpf programs. */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Call bpf when the kernel is writing header options for the * outgoing packet. 
The bpf prog will first be called * to reserve space in a skb under * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then * the bpf prog will be called to write the header option(s) * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. * * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option * related helpers that will be useful to the bpf programs. * * The kernel gets its chance to reserve space and write * options first before the BPF program does. */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. * New entries can only be added at the end */ enum { BPF_SOCK_OPS_VOID, BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or * -1 if default value should be used */ BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized * window (in packets) or -1 if default * value should be used */ BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an * active connection is initialized */ BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an * active connection is * established */ BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a * passive connection is * established */ BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is * based on the path and may be * dependent on the congestion control * algorithm. In general it indicates * a congestion threshold. RTTs above * this indicate congestion */ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. * Arg1: value of icsk_retransmits * Arg2: value of icsk_rto * Arg3: whether RTO has expired */ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. 
* Arg1: sequence number of 1st byte * Arg2: # segments * Arg3: return value of * tcp_transmit_skb (0 => success) */ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. * Arg1: old_state * Arg2: new_state */ BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle * the packets received at * an already established * connection. * * sock_ops->skb_data: * Referring to the received skb. * It covers the TCP header only. * * bpf_load_hdr_opt() can also * be used to search for a * particular option. */ BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the * header option later in * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. * Arg1: bool want_cookie. (in * writing SYNACK only) * * sock_ops->skb_data: * Not available because no header has * been written yet. * * sock_ops->skb_tcp_flags: * The tcp_flags of the * outgoing skb. (e.g. SYN, ACK, FIN). * * bpf_reserve_hdr_opt() should * be used to reserve space. */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options * Arg1: bool want_cookie. (in * writing SYNACK only) * * sock_ops->skb_data: * Referring to the outgoing skb. * It covers the TCP header * that has already been written * by the kernel and the * earlier bpf-progs. * * sock_ops->skb_tcp_flags: * The tcp_flags of the outgoing * skb. (e.g. SYN, ACK, FIN). * * bpf_store_hdr_opt() should * be used to write the * option. * * bpf_load_hdr_opt() can also * be used to search for a * particular option that * has already been written * by the kernel or the * earlier bpf-progs. */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect * changes between the TCP and BPF versions. Ideally this should never happen. * If it does, we need to add code to convert them before calling * the BPF sock_ops function. 
*/ enum { BPF_TCP_ESTABLISHED = 1, BPF_TCP_SYN_SENT, BPF_TCP_SYN_RECV, BPF_TCP_FIN_WAIT1, BPF_TCP_FIN_WAIT2, BPF_TCP_TIME_WAIT, BPF_TCP_CLOSE, BPF_TCP_CLOSE_WAIT, BPF_TCP_LAST_ACK, BPF_TCP_LISTEN, BPF_TCP_CLOSING, /* Now a valid state */ BPF_TCP_NEW_SYN_RECV, BPF_TCP_MAX_STATES /* Leave at the end! */ }; enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ /* Copy the SYN pkt to optval * * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit * to only getting from the saved_syn. It can either get the * syn packet from: * * 1. the just-received SYN packet (only available when writing the * SYNACK). It will be useful when it is not necessary to * save the SYN packet for latter use. It is also the only way * to get the SYN during syncookie mode because the syn * packet cannot be saved during syncookie. * * OR * * 2. the earlier saved syn which was done by * bpf_setsockopt(TCP_SAVE_SYN). * * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the * SYN packet is obtained. * * If the bpf-prog does not need the IP[46] header, the * bpf-prog can avoid parsing the IP header by using * TCP_BPF_SYN. Otherwise, the bpf-prog can get both * IP[46] and TCP header by using TCP_BPF_SYN_IP. * * >0: Total number of bytes copied * -ENOSPC: Not enough space in optval. Only optlen number of * bytes is copied. * -ENOENT: The SYN skb is not available now and the earlier SYN pkt * is not saved by setsockopt(TCP_SAVE_SYN). */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), }; /* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. 
*/ enum { BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the * total option spaces * required for an established * sk in order to calculate the * MSS. No skb is actually * sent. */ BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode * when sending a SYN. */ }; struct bpf_perf_event_value { __u64 counter; __u64 enabled; __u64 running; }; enum { BPF_DEVCG_ACC_MKNOD = (1ULL << 0), BPF_DEVCG_ACC_READ = (1ULL << 1), BPF_DEVCG_ACC_WRITE = (1ULL << 2), }; enum { BPF_DEVCG_DEV_BLOCK = (1ULL << 0), BPF_DEVCG_DEV_CHAR = (1ULL << 1), }; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ __u32 access_type; __u32 major; __u32 minor; }; struct bpf_raw_tracepoint_args { __u64 args[0]; }; /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), }; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ }; struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop */ __u8 family; /* set if lookup is to consider L4 data - e.g., FIB rules */ __u8 l4_protocol; __be16 sport; __be16 dport; union { /* used for MTU check */ /* input to lookup */ __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ /* output: MTU value */ __u16 
mtu_result; }; /* input: L3 device index for lookup * output: device index from FIB lookup */ __u32 ifindex; union { /* inputs to lookup */ __u8 tos; /* AF_INET */ __be32 flowinfo; /* AF_INET6, flow_label + priority */ /* output: metric of fib result (IPv4/IPv6 only) */ __u32 rt_metric; }; union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ }; /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in * network header. output: bpf_fib_lookup sets to gateway address * if FIB lookup returns gateway route */ union { __be32 ipv4_dst; __u32 ipv6_dst[4]; /* in6_addr; network order */ }; /* output */ __be16 h_vlan_proto; __be16 h_vlan_TCI; __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; struct bpf_redir_neigh { /* network family for lookup (AF_INET, AF_INET6) */ __u32 nh_family; /* network address of nexthop; skips fib lookup to find gateway */ union { __be32 ipv4_nh; __u32 ipv6_nh[4]; /* in6_addr; network order */ }; }; /* bpf_check_mtu flags*/ enum bpf_check_mtu_flags { BPF_MTU_CHK_SEGS = (1U << 0), }; enum bpf_check_mtu_ret { BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ }; enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ BPF_FD_TYPE_UPROBE, /* filename + offset */ BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; enum { BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), }; struct bpf_flow_keys { __u16 nhoff; __u16 thoff; __u16 addr_proto; /* ETH_P_* of valid addrs */ __u8 is_frag; __u8 is_first_frag; __u8 is_encap; __u8 ip_proto; __be16 n_proto; __be16 sport; __be16 dport; union { struct { __be32 ipv4_src; __be32 ipv4_dst; }; 
struct { __u32 ipv6_src[4]; /* in6_addr; network order */ __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; __u32 flags; __be32 flow_label; }; struct bpf_func_info { __u32 insn_off; __u32 type_id; }; #define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) #define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) struct bpf_line_info { __u32 insn_off; __u32 file_name_off; __u32 line_off; __u32 line_col; }; struct bpf_spin_lock { __u32 val; }; struct bpf_timer { __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_dynptr { __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_list_head { __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_list_node { __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_rb_root { __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_rb_node { __u64 :64; __u64 :64; __u64 :64; } __attribute__((aligned(8))); struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. */ __u32 file_pos; /* Sysctl file position to read from, write to. * Allows 1,2,4-byte read an 4-byte write. */ }; struct bpf_sockopt { __bpf_md_ptr(struct bpf_sock *, sk); __bpf_md_ptr(void *, optval); __bpf_md_ptr(void *, optval_end); __s32 level; __s32 optname; __s32 optlen; __s32 retval; }; struct bpf_pidns_info { __u32 pid; __u32 tgid; }; /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ struct bpf_sk_lookup { union { __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ __u32 remote_ip4; /* Network byte order */ __u32 remote_ip6[4]; /* Network byte order */ __be16 remote_port; /* Network byte order */ __u16 :16; /* Zero padding */ __u32 local_ip4; /* Network byte order */ __u32 local_ip6[4]; /* Network byte order */ __u32 local_port; /* Host byte order */ __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ }; /* * struct btf_ptr is used for typed pointer representation; the * type id is used to render the pointer data as the appropriate type * via the bpf_snprintf_btf() helper described above. A flags field - * potentially to specify additional details about the BTF pointer * (rather than its mode of display) - is included for future use. * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. */ struct btf_ptr { void *ptr; __u32 type_id; __u32 flags; /* BTF ptr flags; unused at present. */ }; /* * Flags to control bpf_snprintf_btf() behaviour. * - BTF_F_COMPACT: no formatting around type information * - BTF_F_NONAME: no struct/union member names/types * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; * equivalent to %px. * - BTF_F_ZERO: show zero-valued struct/union members; they * are not displayed by default */ enum { BTF_F_COMPACT = (1ULL << 0), BTF_F_NONAME = (1ULL << 1), BTF_F_PTR_RAW = (1ULL << 2), BTF_F_ZERO = (1ULL << 3), }; /* bpf_core_relo_kind encodes which aspect of captured field/type/enum value * has to be adjusted by relocations. It is emitted by llvm and passed to * libbpf and later to the kernel. 
*/ enum bpf_core_relo_kind { BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ }; /* * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf * and from libbpf to the kernel. * * CO-RE relocation captures the following data: * - insn_off - instruction offset (in bytes) within a BPF program that needs * its insn->imm field to be relocated with actual field info; * - type_id - BTF type ID of the "root" (containing) entity of a relocatable * type or field; * - access_str_off - offset into corresponding .BTF string section. String * interpretation depends on specific relocation kind: * - for field-based relocations, string encodes an accessed field using * a sequence of field and array indices, separated by colon (:). It's * conceptually very close to LLVM's getelementptr ([0]) instruction's * arguments for identifying offset to a field. 
* - for type-based relocations, strings is expected to be just "0"; * - for enum value-based relocations, string contains an index of enum * value within its enum type; * - kind - one of enum bpf_core_relo_kind; * * Example: * struct sample { * int a; * struct { * int b[10]; * }; * }; * * struct sample *s = ...; * int *x = &s->a; // encoded as "0:0" (a is field #0) * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, * // b is field #0 inside anon struct, accessing elem #5) * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) * * type_id for all relocs in this example will capture BTF type id of * `struct sample`. * * Such relocation is emitted when using __builtin_preserve_access_index() * Clang built-in, passing expression that captures field address, e.g.: * * bpf_probe_read(&dst, sizeof(dst), * __builtin_preserve_access_index(&src->a.b.c)); * * In this case Clang will emit field relocation recording necessary data to * be able to find offset of embedded `a.b.c` field within `src` struct. 
* * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction */ struct bpf_core_relo { __u32 insn_off; __u32 type_id; __u32 access_str_off; enum bpf_core_relo_kind kind; }; #endif /* _UAPI__LINUX_BPF_H__ */ xdp-tools-1.6.1/headers/linux/btf.h000066400000000000000000000127461514310632100171540ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* Copyright (c) 2018 Facebook */ #ifndef _UAPI__LINUX_BTF_H__ #define _UAPI__LINUX_BTF_H__ #include #define BTF_MAGIC 0xeB9F #define BTF_VERSION 1 struct btf_header { __u16 magic; __u8 version; __u8 flags; __u32 hdr_len; /* All offsets are in bytes relative to the end of this header */ __u32 type_off; /* offset of type section */ __u32 type_len; /* length of type section */ __u32 str_off; /* offset of string section */ __u32 str_len; /* length of string section */ }; /* Max # of type identifier */ #define BTF_MAX_TYPE 0x000fffff /* Max offset into the string section */ #define BTF_MAX_NAME_OFFSET 0x00ffffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff struct btf_type { __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-28: kind (e.g. int, ptr, array...etc) * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union, enum, fwd and enum64 */ __u32 info; /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. * "type" is a type_id referring to another type. 
*/ union { __u32 size; __u32 type; }; }; #define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) enum { BTF_KIND_UNKN = 0, /* Unknown */ BTF_KIND_INT = 1, /* Integer */ BTF_KIND_PTR = 2, /* Pointer */ BTF_KIND_ARRAY = 3, /* Array */ BTF_KIND_STRUCT = 4, /* Struct */ BTF_KIND_UNION = 5, /* Union */ BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ BTF_KIND_FWD = 7, /* Forward */ BTF_KIND_TYPEDEF = 8, /* Typedef */ BTF_KIND_VOLATILE = 9, /* Volatile */ BTF_KIND_CONST = 10, /* Const */ BTF_KIND_RESTRICT = 11, /* Restrict */ BTF_KIND_FUNC = 12, /* Function */ BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ BTF_KIND_VAR = 14, /* Variable */ BTF_KIND_DATASEC = 15, /* Section */ BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ BTF_KIND_TYPE_TAG = 18, /* Type Tag */ BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, }; /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. */ /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) #define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED (1 << 0) #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the * info in "struct btf_type"). */ struct btf_enum { __u32 name_off; __s32 val; }; /* BTF_KIND_ARRAY is followed by one "struct btf_array" */ struct btf_array { __u32 type; __u32 index_type; __u32 nelems; }; /* BTF_KIND_STRUCT and BTF_KIND_UNION are followed * by multiple "struct btf_member". The exact number * of btf_member is stored in the vlen (of the info in * "struct btf_type"). 
*/ struct btf_member { __u32 name_off; __u32 type; /* If the type info kind_flag is set, the btf_member offset * contains both member bitfield size and bit offset. The * bitfield size is set for bitfield members. If the type * info kind_flag is not set, the offset contains only bit * offset. */ __u32 offset; }; /* If the struct/union type info kind_flag is set, the * following two macros are used to access bitfield_size * and bit_offset from btf_member.offset. */ #define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) #define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). */ struct btf_param { __u32 name_off; __u32 type; }; enum { BTF_VAR_STATIC = 0, BTF_VAR_GLOBAL_ALLOCATED = 1, BTF_VAR_GLOBAL_EXTERN = 2, }; enum btf_func_linkage { BTF_FUNC_STATIC = 0, BTF_FUNC_GLOBAL = 1, BTF_FUNC_EXTERN = 2, }; /* BTF_KIND_VAR is followed by a single "struct btf_var" to describe * additional information related to the variable such as its linkage. */ struct btf_var { __u32 linkage; }; /* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" * to describe all BTF_KIND_VAR types it contains along with it's * in-section offset as well as size. */ struct btf_var_secinfo { __u32 type; __u32 offset; __u32 size; }; /* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe * additional information related to the tag applied location. * If component_idx == -1, the tag is applied to a struct, union, * variable or function. Otherwise, it is applied to a struct/union * member or a func argument, and component_idx indicates which member * or argument (0 ... vlen-1). */ struct btf_decl_tag { __s32 component_idx; }; /* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". * The exact number of btf_enum64 is stored in the vlen (of the * info in "struct btf_type"). 
*/ struct btf_enum64 { __u32 name_off; __u32 val_lo32; __u32 val_hi32; }; #endif /* _UAPI__LINUX_BTF_H__ */ xdp-tools-1.6.1/headers/linux/compiler-gcc.h000066400000000000000000000022131514310632100207310ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TOOLS_LINUX_COMPILER_H_ #error "Please don't include directly, include instead." #endif /* * Common definitions for all gcc versions go here. */ #ifndef GCC_VERSION #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) #endif #if GCC_VERSION >= 70000 && !defined(__CHECKER__) # define __fallthrough __attribute__ ((fallthrough)) #endif #if __has_attribute(__error__) # define __compiletime_error(message) __attribute__((error(message))) #endif /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #ifndef __pure #define __pure __attribute__((pure)) #endif #define noinline __attribute__((noinline)) #ifndef __packed #define __packed __attribute__((packed)) #endif #ifndef __noreturn #define __noreturn __attribute__((noreturn)) #endif #ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) #endif #define __printf(a, b) __attribute__((format(printf, a, b))) #define __scanf(a, b) __attribute__((format(scanf, a, b))) xdp-tools-1.6.1/headers/linux/compiler.h000066400000000000000000000131441514310632100202040ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TOOLS_LINUX_COMPILER_H_ #define _TOOLS_LINUX_COMPILER_H_ #include #ifndef __compiletime_error # define __compiletime_error(message) #endif #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ extern void prefix ## suffix(void) __compiletime_error(msg); \ if (!(condition)) \ prefix ## suffix(); \ } while (0) #else # define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) #endif #define _compiletime_assert(condition, msg, prefix, suffix) \ 
__compiletime_assert(condition, msg, prefix, suffix) /** * compiletime_assert - break build and emit msg if condition is false * @condition: a compile-time constant condition to check * @msg: a message to emit if condition is false * * In tradition of POSIX assert, this macro will break the build if the * supplied condition is *false*, emitting the supplied error message if the * compiler has support to do so. */ #define compiletime_assert(condition, msg) \ _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) /* Optimization barrier */ /* The "volatile" is due to gcc bugs */ #define barrier() __asm__ __volatile__("": : :"memory") #ifndef __always_inline # define __always_inline inline __attribute__((always_inline)) #endif #ifndef noinline #define noinline #endif /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #endif #ifdef __ANDROID__ /* * Big hammer to get rid of tons of: * "warning: always_inline function might not be inlinable" * * At least on android-ndk-r12/platforms/android-24/arch-arm */ #undef __always_inline #define __always_inline inline #endif #define __user #define __rcu #define __read_mostly #ifndef __attribute_const__ # define __attribute_const__ #endif #ifndef __maybe_unused # define __maybe_unused __attribute__((unused)) #endif #ifndef __used # define __used __attribute__((__unused__)) #endif #ifndef __packed # define __packed __attribute__((__packed__)) #endif #ifndef __force # define __force #endif #ifndef __weak # define __weak __attribute__((weak)) #endif #ifndef likely # define likely(x) __builtin_expect(!!(x), 1) #endif #ifndef unlikely # define unlikely(x) __builtin_expect(!!(x), 0) #endif #ifndef __init # define __init #endif #ifndef noinline # define noinline #endif #include /* * Following functions are taken from kernel sources and * break aliasing rules in their original form. 
* * While kernel is compiled with -fno-strict-aliasing, * perf uses -Wstrict-aliasing=3 which makes build fail * under gcc 4.4. * * Using extra __may_alias__ type to allow aliasing * in this case. */ typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { switch (size) { case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; default: barrier(); __builtin_memcpy((void *)res, (const void *)p, size); barrier(); } } static __always_inline void __write_once_size(volatile void *p, void *res, int size) { switch (size) { case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; default: barrier(); __builtin_memcpy((void *)p, (const void *)res, size); barrier(); } } /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. 
If the size of the accessed data type exceeds the word size of * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will * fall back to memcpy and print a compile-time warning. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #define READ_ONCE(x) \ ({ \ union { typeof(x) __val; char __c[1]; } __u = \ { .__c = { 0 } }; \ __read_once_size(&(x), __u.__c, sizeof(x)); \ __u.__val; \ }) #define WRITE_ONCE(x, val) \ ({ \ union { typeof(x) __val; char __c[1]; } __u = \ { .__val = (val) }; \ __write_once_size(&(x), __u.__c, sizeof(x)); \ __u.__val; \ }) #ifndef __fallthrough # define __fallthrough #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ #define ___PASTE(a, b) a##b #define __PASTE(a, b) ___PASTE(a, b) #endif /* _TOOLS_LINUX_COMPILER_H */ xdp-tools-1.6.1/headers/linux/compiler_types.h000066400000000000000000000020231514310632100214220ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H #define __LINUX_COMPILER_TYPES_H /* Builtins */ /* * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21. * In the meantime, to support gcc < 10, we implement __has_builtin * by hand. */ #ifndef __has_builtin #define __has_builtin(x) (0) #endif #ifdef __CHECKER__ /* context/locking */ # define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) #else /* __CHECKER__ */ /* context/locking */ # define __must_hold(x) # define __acquires(x) # define __releases(x) # define __acquire(x) (void)0 # define __release(x) (void)0 # define __cond_lock(x,c) (c) #endif /* __CHECKER__ */ /* Compiler specific macros. */ #ifdef __GNUC__ #include #endif #endif /* __LINUX_COMPILER_TYPES_H */ xdp-tools-1.6.1/headers/linux/err.h000066400000000000000000000011421514310632100171550ustar00rootroot00000000000000/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #ifndef __LINUX_ERR_H #define __LINUX_ERR_H #include #include #include #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) static inline void * ERR_PTR(long error_) { return (void *) error_; } static inline long PTR_ERR(const void *ptr) { return (long) ptr; } static inline bool IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } static inline bool IS_ERR_OR_NULL(const void *ptr) { return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); } #endif xdp-tools-1.6.1/headers/linux/hashtable.h000066400000000000000000000127701514310632100203310ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include #include #include #include #include #include #include #include "../lib/util/util.h" #define HASH_GOLDEN_RATIO_32 0x61C88647 #define HASH_GOLDEN_RATIO_64 0x61C8864680B583EBull #if (__SIZEOF_LONG__ * __CHAR_BIT__) == 32 #define HASH_GOLDEN_RATIO_PRIME HASH_GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif (__SIZEOF_LONG__ * __CHAR_BIT__) == 64 #define hash_long(val, bits) hash_64(val, bits) #define HASH_GOLDEN_RATIO_PRIME HASH_GOLDEN_RATIO_64 #else #error "Wordsize not 32 or 64" #endif static inline uint32_t __hash_32(uint32_t val) { return val * HASH_GOLDEN_RATIO_32; } static inline uint32_t hash_32(uint32_t val, unsigned int bits) { /* High bits are 
more random, so use them. */ return __hash_32(val) >> (32 - bits); } static inline uint32_t hash_64(uint64_t val, unsigned int bits) { #if LONG_TYPE_SIZE * CHAR_BIT == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ return val * HASH_GOLDEN_RATIO_64 >> (64 - bits); #else /* Hash 64 bits using only 32x32-bit multiply. */ return hash_32((uint32_t)val ^ __hash_32(val >> 32), bits); #endif } #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilogb(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32((uint32_t)val, bits) : hash_long((uint64_t)val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. 
*/ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define 
hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif xdp-tools-1.6.1/headers/linux/hlist.h000066400000000000000000000120301514310632100175060ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_HLIST_H #define __LINUX_HLIST_H struct list_head; struct rhash_head { struct rhash_head *next; }; #define HLIST_POISON_POINTER_DELTA 0 #define HLIST_POISON1 ((void *) 0x100 + HLIST_POISON_POINTER_DELTA) #define HLIST_POISON2 ((void *) 0x200 + HLIST_POISON_POINTER_DELTA) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). 
*/ struct hlist_head { struct hlist_node *first; }; struct hlist_node { struct hlist_node *next, **pprev; }; #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } static inline int hlist_empty(const struct hlist_head *h) { return !h->first; } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; __atomic_store_n(pprev, next, __ATOMIC_RELAXED); if (next) next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = HLIST_POISON1; n->pprev = HLIST_POISON2; } static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; if (first) first->pprev = &n->next; h->first = n; n->pprev = &h->first; } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; next->pprev = &n->next; *(n->pprev) = n; } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; prev->next = n; n->pprev = &prev->next; if (n->next) n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. 
*/ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. 
*/ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * list_for_each_from - iterate over a list from one of its nodes * @pos: the &struct list_head to use as a loop cursor, from where to start * @head: the head for your list. */ #define list_for_each_from(pos, head) \ for (; pos != (head); pos = pos->next) #endif xdp-tools-1.6.1/headers/linux/icmp.h000066400000000000000000000113001514310632100173120ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ #ifndef _UAPI_LINUX_ICMP_H #define _UAPI_LINUX_ICMP_H #include #include #include #include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ #define ICMP_SOURCE_QUENCH 4 /* Source Quench */ #define ICMP_REDIRECT 5 /* Redirect (change route) */ #define ICMP_ECHO 8 /* Echo Request */ #define ICMP_TIME_EXCEEDED 11 /* Time Exceeded */ #define ICMP_PARAMETERPROB 12 /* Parameter Problem */ #define ICMP_TIMESTAMP 13 /* Timestamp Request */ #define ICMP_TIMESTAMPREPLY 14 /* Timestamp Reply */ #define ICMP_INFO_REQUEST 15 /* Information Request */ #define ICMP_INFO_REPLY 16 /* Information Reply */ #define ICMP_ADDRESS 17 /* Address Mask Request */ #define ICMP_ADDRESSREPLY 18 /* Address Mask Reply */ #define NR_ICMP_TYPES 18 /* Codes for UNREACH. */ #define ICMP_NET_UNREACH 0 /* Network Unreachable */ #define ICMP_HOST_UNREACH 1 /* Host Unreachable */ #define ICMP_PROT_UNREACH 2 /* Protocol Unreachable */ #define ICMP_PORT_UNREACH 3 /* Port Unreachable */ #define ICMP_FRAG_NEEDED 4 /* Fragmentation Needed/DF set */ #define ICMP_SR_FAILED 5 /* Source Route failed */ #define ICMP_NET_UNKNOWN 6 #define ICMP_HOST_UNKNOWN 7 #define ICMP_HOST_ISOLATED 8 #define ICMP_NET_ANO 9 #define ICMP_HOST_ANO 10 #define ICMP_NET_UNR_TOS 11 #define ICMP_HOST_UNR_TOS 12 #define ICMP_PKT_FILTERED 13 /* Packet filtered */ #define ICMP_PREC_VIOLATION 14 /* Precedence violation */ #define ICMP_PREC_CUTOFF 15 /* Precedence cut off */ #define NR_ICMP_UNREACH 15 /* instead of hardcoding immediate value */ /* Codes for REDIRECT. */ #define ICMP_REDIR_NET 0 /* Redirect Net */ #define ICMP_REDIR_HOST 1 /* Redirect Host */ #define ICMP_REDIR_NETTOS 2 /* Redirect Net for TOS */ #define ICMP_REDIR_HOSTTOS 3 /* Redirect Host for TOS */ /* Codes for TIME_EXCEEDED. 
*/ #define ICMP_EXC_TTL 0 /* TTL count exceeded */ #define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ /* Codes for EXT_ECHO (PROBE) */ #define ICMP_EXT_ECHO 42 #define ICMP_EXT_ECHOREPLY 43 #define ICMP_EXT_CODE_MAL_QUERY 1 /* Malformed Query */ #define ICMP_EXT_CODE_NO_IF 2 /* No such Interface */ #define ICMP_EXT_CODE_NO_TABLE_ENT 3 /* No such Table Entry */ #define ICMP_EXT_CODE_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ /* Constants for EXT_ECHO (PROBE) */ #define ICMP_EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ #define ICMP_EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ #define ICMP_EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ #define ICMP_EXT_ECHO_CTYPE_NAME 1 #define ICMP_EXT_ECHO_CTYPE_INDEX 2 #define ICMP_EXT_ECHO_CTYPE_ADDR 3 #define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ #define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ struct icmphdr { __u8 type; __u8 code; __sum16 checksum; union { struct { __be16 id; __be16 sequence; } echo; __be32 gateway; struct { __be16 __unused; __be16 mtu; } frag; __u8 reserved[4]; } un; }; /* * constants for (set|get)sockopt */ #define ICMP_FILTER 1 struct icmp_filter { __u32 data; }; /* RFC 4884 extension struct: one per message */ struct icmp_ext_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved1:4, version:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 version:4, reserved1:4; #else #error "Please fix " #endif __u8 reserved2; __sum16 checksum; }; /* RFC 4884 extension object header: one for each object */ struct icmp_extobj_hdr { __be16 length; __u8 class_num; __u8 class_type; }; /* RFC 8335: 2.1 Header for c-type 3 payload */ struct icmp_ext_echo_ctype3_hdr { __be16 afi; __u8 addrlen; __u8 reserved; }; /* RFC 8335: 2.1 Interface Identification Object */ struct icmp_ext_echo_iio { struct icmp_extobj_hdr extobj_hdr; union { char name[IFNAMSIZ]; __be32 ifindex; struct { struct icmp_ext_echo_ctype3_hdr ctype3_hdr; union { __be32 
ipv4_addr; struct in6_addr ipv6_addr; } ip_addr; } addr; } ident; }; #endif /* _UAPI_LINUX_ICMP_H */ xdp-tools-1.6.1/headers/linux/if.h000066400000000000000000000004551514310632100167710ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* Truncated header from the kernel sources that just defines the name sizes * below; used by icmp.h */ #ifndef _LINUX_IF_H #define _LINUX_IF_H #define IFNAMSIZ 16 #define IFALIASZ 256 #define ALTIFNAMSIZ 128 #endif /* _LINUX_IF_H */ xdp-tools-1.6.1/headers/linux/if_link.h000066400000000000000000000745731514310632100200220ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_IF_LINK_H #define _UAPI_LINUX_IF_LINK_H #include #include /* This struct should be in sync with struct rtnl_link_stats64 */ struct rtnl_link_stats { __u32 rx_packets; __u32 tx_packets; __u32 rx_bytes; __u32 tx_bytes; __u32 rx_errors; __u32 tx_errors; __u32 rx_dropped; __u32 tx_dropped; __u32 multicast; __u32 collisions; /* detailed rx_errors: */ __u32 rx_length_errors; __u32 rx_over_errors; __u32 rx_crc_errors; __u32 rx_frame_errors; __u32 rx_fifo_errors; __u32 rx_missed_errors; /* detailed tx_errors */ __u32 tx_aborted_errors; __u32 tx_carrier_errors; __u32 tx_fifo_errors; __u32 tx_heartbeat_errors; __u32 tx_window_errors; /* for cslip etc */ __u32 rx_compressed; __u32 tx_compressed; __u32 rx_nohandler; }; /** * struct rtnl_link_stats64 - The main device statistics structure. * * @rx_packets: Number of good packets received by the interface. * For hardware interfaces counts all good packets received from the device * by the host, including packets which host had to drop at various stages * of processing (even in the driver). * * @tx_packets: Number of packets successfully transmitted. 
* For hardware interfaces counts packets which host was able to successfully * hand over to the device, which does not necessarily mean that packets * had been successfully transmitted out of the device, only that device * acknowledged it copied them out of host memory. * * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. * * For IEEE 802.3 devices should count the length of Ethernet Frames * excluding the FCS. * * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. * * For IEEE 802.3 devices should count the length of Ethernet Frames * excluding the FCS. * * @rx_errors: Total number of bad packets received on this network device. * This counter must include events counted by @rx_length_errors, * @rx_crc_errors, @rx_frame_errors and other errors not otherwise * counted. * * @tx_errors: Total number of transmit problems. * This counter must include events counter by @tx_aborted_errors, * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, * @tx_window_errors and other errors not otherwise counted. * * @rx_dropped: Number of packets received but not processed, * e.g. due to lack of resources or unsupported protocol. * For hardware interfaces this counter may include packets discarded * due to L2 address filtering but should not include packets dropped * by the device due to buffer exhaustion which are counted separately in * @rx_missed_errors (since procfs folds those two counters together). * * @tx_dropped: Number of packets dropped on their way to transmission, * e.g. due to lack of resources. * * @multicast: Multicast packets received. * For hardware interfaces this statistic is commonly calculated * at the device level (unlike @rx_packets) and therefore may include * packets which did not reach the host. * * For IEEE 802.3 devices this counter may be equivalent to: * * - 30.3.1.1.21 aMulticastFramesReceivedOK * * @collisions: Number of collisions during packet transmissions. 
* * @rx_length_errors: Number of packets dropped due to invalid length. * Part of aggregate "frame" errors in `/proc/net/dev`. * * For IEEE 802.3 devices this counter should be equivalent to a sum * of the following attributes: * * - 30.3.1.1.23 aInRangeLengthErrors * - 30.3.1.1.24 aOutOfRangeLengthField * - 30.3.1.1.25 aFrameTooLongErrors * * @rx_over_errors: Receiver FIFO overflow event counter. * * Historically the count of overflow events. Such events may be * reported in the receive descriptors or via interrupts, and may * not correspond one-to-one with dropped packets. * * The recommended interpretation for high speed interfaces is - * number of packets dropped because they did not fit into buffers * provided by the host, e.g. packets larger than MTU or next buffer * in the ring was not available for a scatter transfer. * * Part of aggregate "frame" errors in `/proc/net/dev`. * * This statistics was historically used interchangeably with * @rx_fifo_errors. * * This statistic corresponds to hardware events and is not commonly used * on software devices. * * @rx_crc_errors: Number of packets received with a CRC error. * Part of aggregate "frame" errors in `/proc/net/dev`. * * For IEEE 802.3 devices this counter must be equivalent to: * * - 30.3.1.1.6 aFrameCheckSequenceErrors * * @rx_frame_errors: Receiver frame alignment errors. * Part of aggregate "frame" errors in `/proc/net/dev`. * * For IEEE 802.3 devices this counter should be equivalent to: * * - 30.3.1.1.7 aAlignmentErrors * * @rx_fifo_errors: Receiver FIFO error counter. * * Historically the count of overflow events. Those events may be * reported in the receive descriptors or via interrupts, and may * not correspond one-to-one with dropped packets. * * This statistics was used interchangeably with @rx_over_errors. * Not recommended for use in drivers for high speed interfaces. * * This statistic is used on software devices, e.g. 
to count software * packet queue overflow (can) or sequencing errors (GRE). * * @rx_missed_errors: Count of packets missed by the host. * Folded into the "drop" counter in `/proc/net/dev`. * * Counts number of packets dropped by the device due to lack * of buffer space. This usually indicates that the host interface * is slower than the network interface, or host is not keeping up * with the receive packet rate. * * This statistic corresponds to hardware events and is not used * on software devices. * * @tx_aborted_errors: * Part of aggregate "carrier" errors in `/proc/net/dev`. * For IEEE 802.3 devices capable of half-duplex operation this counter * must be equivalent to: * * - 30.3.1.1.11 aFramesAbortedDueToXSColls * * High speed interfaces may use this counter as a general device * discard counter. * * @tx_carrier_errors: Number of frame transmission errors due to loss * of carrier during transmission. * Part of aggregate "carrier" errors in `/proc/net/dev`. * * For IEEE 802.3 devices this counter must be equivalent to: * * - 30.3.1.1.13 aCarrierSenseErrors * * @tx_fifo_errors: Number of frame transmission errors due to device * FIFO underrun / underflow. This condition occurs when the device * begins transmission of a frame but is unable to deliver the * entire frame to the transmitter in time for transmission. * Part of aggregate "carrier" errors in `/proc/net/dev`. * * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for * old half-duplex Ethernet. * Part of aggregate "carrier" errors in `/proc/net/dev`. * * For IEEE 802.3 devices possibly equivalent to: * * - 30.3.2.1.4 aSQETestErrors * * @tx_window_errors: Number of frame transmission errors due * to late collisions (for Ethernet - after the first 64B of transmission). * Part of aggregate "carrier" errors in `/proc/net/dev`. * * For IEEE 802.3 devices this counter must be equivalent to: * * - 30.3.1.1.10 aLateCollisions * * @rx_compressed: Number of correctly received compressed packets. 
* This counters is only meaningful for interfaces which support * packet compression (e.g. CSLIP, PPP). * * @tx_compressed: Number of transmitted compressed packets. * This counters is only meaningful for interfaces which support * packet compression (e.g. CSLIP, PPP). * * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). */ struct rtnl_link_stats64 { __u64 rx_packets; __u64 tx_packets; __u64 rx_bytes; __u64 tx_bytes; __u64 rx_errors; __u64 tx_errors; __u64 rx_dropped; __u64 tx_dropped; __u64 multicast; __u64 collisions; /* detailed rx_errors: */ __u64 rx_length_errors; __u64 rx_over_errors; __u64 rx_crc_errors; __u64 rx_frame_errors; __u64 rx_fifo_errors; __u64 rx_missed_errors; /* detailed tx_errors */ __u64 tx_aborted_errors; __u64 tx_carrier_errors; __u64 tx_fifo_errors; __u64 tx_heartbeat_errors; __u64 tx_window_errors; /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; }; /* The struct should be in sync with struct ifmap */ struct rtnl_link_ifmap { __u64 mem_start; __u64 mem_end; __u64 base_addr; __u16 irq; __u8 dma; __u8 port; }; /* * IFLA_AF_SPEC * Contains nested attributes for address family specific attributes. * Each address family may create a attribute with the address family * number as type and create its own attribute structure in it. 
* * Example: * [IFLA_AF_SPEC] = { * [AF_INET] = { * [IFLA_INET_CONF] = ..., * }, * [AF_INET6] = { * [IFLA_INET6_FLAGS] = ..., * [IFLA_INET6_CONF] = ..., * } * } */ enum { IFLA_UNSPEC, IFLA_ADDRESS, IFLA_BROADCAST, IFLA_IFNAME, IFLA_MTU, IFLA_LINK, IFLA_QDISC, IFLA_STATS, IFLA_COST, #define IFLA_COST IFLA_COST IFLA_PRIORITY, #define IFLA_PRIORITY IFLA_PRIORITY IFLA_MASTER, #define IFLA_MASTER IFLA_MASTER IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ #define IFLA_WIRELESS IFLA_WIRELESS IFLA_PROTINFO, /* Protocol specific information for a link */ #define IFLA_PROTINFO IFLA_PROTINFO IFLA_TXQLEN, #define IFLA_TXQLEN IFLA_TXQLEN IFLA_MAP, #define IFLA_MAP IFLA_MAP IFLA_WEIGHT, #define IFLA_WEIGHT IFLA_WEIGHT IFLA_OPERSTATE, IFLA_LINKMODE, IFLA_LINKINFO, #define IFLA_LINKINFO IFLA_LINKINFO IFLA_NET_NS_PID, IFLA_IFALIAS, IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ IFLA_VFINFO_LIST, IFLA_STATS64, IFLA_VF_PORTS, IFLA_PORT_SELF, IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ IFLA_NET_NS_FD, IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ #define IFLA_PROMISCUITY IFLA_PROMISCUITY IFLA_NUM_TX_QUEUES, IFLA_NUM_RX_QUEUES, IFLA_CARRIER, IFLA_PHYS_PORT_ID, IFLA_CARRIER_CHANGES, IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, IFLA_PHYS_PORT_NAME, IFLA_PROTO_DOWN, IFLA_GSO_MAX_SEGS, IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, IFLA_PROTO_DOWN_REASON, __IFLA_MAX }; #define IFLA_MAX (__IFLA_MAX - 1) enum { IFLA_PROTO_DOWN_REASON_UNSPEC, IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ __IFLA_PROTO_DOWN_REASON_CNT, IFLA_PROTO_DOWN_REASON_MAX = 
__IFLA_PROTO_DOWN_REASON_CNT - 1 }; /* backwards compatibility for userspace */ #ifndef __KERNEL__ #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) #define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) #endif enum { IFLA_INET_UNSPEC, IFLA_INET_CONF, __IFLA_INET_MAX, }; #define IFLA_INET_MAX (__IFLA_INET_MAX - 1) /* ifi_flags. IFF_* flags. The only change is: IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are more not changeable by user. They describe link media characteristics and set by device driver. Comments: - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid - If neither of these three flags are set; the interface is NBMA. - IFF_MULTICAST does not mean anything special: multicasts can be used on all not-NBMA links. IFF_MULTICAST means that this media uses special encapsulation for multicast frames. Apparently, all IFF_POINTOPOINT and IFF_BROADCAST devices are able to use multicasts too. */ /* IFLA_LINK. For usual devices it is equal ifi_index. If it is a "virtual interface" (f.e. tunnel), ifi_link can point to real physical interface (f.e. for bandwidth calculations), or maybe 0, what means, that real media is unknown (usual for IPIP tunnels, when route to endpoint is allowed to change) */ /* Subtype attributes for IFLA_PROTINFO */ enum { IFLA_INET6_UNSPEC, IFLA_INET6_FLAGS, /* link flags */ IFLA_INET6_CONF, /* sysctl parameters */ IFLA_INET6_STATS, /* statistics */ IFLA_INET6_MCAST, /* MC things. What of them? 
*/ IFLA_INET6_CACHEINFO, /* time values and max reasm size */ IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ __IFLA_INET6_MAX }; #define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, IN6_ADDR_GEN_MODE_STABLE_PRIVACY, IN6_ADDR_GEN_MODE_RANDOM, }; /* Bridge section */ enum { IFLA_BR_UNSPEC, IFLA_BR_FORWARD_DELAY, IFLA_BR_HELLO_TIME, IFLA_BR_MAX_AGE, IFLA_BR_AGEING_TIME, IFLA_BR_STP_STATE, IFLA_BR_PRIORITY, IFLA_BR_VLAN_FILTERING, IFLA_BR_VLAN_PROTOCOL, IFLA_BR_GROUP_FWD_MASK, IFLA_BR_ROOT_ID, IFLA_BR_BRIDGE_ID, IFLA_BR_ROOT_PORT, IFLA_BR_ROOT_PATH_COST, IFLA_BR_TOPOLOGY_CHANGE, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, IFLA_BR_HELLO_TIMER, IFLA_BR_TCN_TIMER, IFLA_BR_TOPOLOGY_CHANGE_TIMER, IFLA_BR_GC_TIMER, IFLA_BR_GROUP_ADDR, IFLA_BR_FDB_FLUSH, IFLA_BR_MCAST_ROUTER, IFLA_BR_MCAST_SNOOPING, IFLA_BR_MCAST_QUERY_USE_IFADDR, IFLA_BR_MCAST_QUERIER, IFLA_BR_MCAST_HASH_ELASTICITY, IFLA_BR_MCAST_HASH_MAX, IFLA_BR_MCAST_LAST_MEMBER_CNT, IFLA_BR_MCAST_STARTUP_QUERY_CNT, IFLA_BR_MCAST_LAST_MEMBER_INTVL, IFLA_BR_MCAST_MEMBERSHIP_INTVL, IFLA_BR_MCAST_QUERIER_INTVL, IFLA_BR_MCAST_QUERY_INTVL, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, IFLA_BR_NF_CALL_IPTABLES, IFLA_BR_NF_CALL_IP6TABLES, IFLA_BR_NF_CALL_ARPTABLES, IFLA_BR_VLAN_DEFAULT_PVID, IFLA_BR_PAD, IFLA_BR_VLAN_STATS_ENABLED, IFLA_BR_MCAST_STATS_ENABLED, IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, __IFLA_BR_MAX, }; #define IFLA_BR_MAX (__IFLA_BR_MAX - 1) struct ifla_bridge_id { __u8 prio[2]; __u8 addr[6]; /* ETH_ALEN */ }; enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, }; enum { IFLA_BRPORT_UNSPEC, IFLA_BRPORT_STATE, /* Spanning tree state */ IFLA_BRPORT_PRIORITY, /* " priority */ IFLA_BRPORT_COST, /* " cost */ IFLA_BRPORT_MODE, /* mode (hairpin) */ IFLA_BRPORT_GUARD, /* bpdu 
guard */ IFLA_BRPORT_PROTECT, /* root port protection */ IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ IFLA_BRPORT_LEARNING, /* mac learning */ IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ IFLA_BRPORT_PROXYARP, /* proxy ARP */ IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ IFLA_BRPORT_ROOT_ID, /* designated root */ IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ IFLA_BRPORT_DESIGNATED_PORT, IFLA_BRPORT_DESIGNATED_COST, IFLA_BRPORT_ID, IFLA_BRPORT_NO, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, IFLA_BRPORT_CONFIG_PENDING, IFLA_BRPORT_MESSAGE_AGE_TIMER, IFLA_BRPORT_FORWARD_DELAY_TIMER, IFLA_BRPORT_HOLD_TIMER, IFLA_BRPORT_FLUSH, IFLA_BRPORT_MULTICAST_ROUTER, IFLA_BRPORT_PAD, IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) struct ifla_cacheinfo { __u32 max_reasm_len; __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ __u32 reachable_time; __u32 retrans_time; }; enum { IFLA_INFO_UNSPEC, IFLA_INFO_KIND, IFLA_INFO_DATA, IFLA_INFO_XSTATS, IFLA_INFO_SLAVE_KIND, IFLA_INFO_SLAVE_DATA, __IFLA_INFO_MAX, }; #define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) /* VLAN section */ enum { IFLA_VLAN_UNSPEC, IFLA_VLAN_ID, IFLA_VLAN_FLAGS, IFLA_VLAN_EGRESS_QOS, IFLA_VLAN_INGRESS_QOS, IFLA_VLAN_PROTOCOL, __IFLA_VLAN_MAX, }; #define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) struct ifla_vlan_flags { __u32 flags; __u32 mask; }; enum { IFLA_VLAN_QOS_UNSPEC, IFLA_VLAN_QOS_MAPPING, __IFLA_VLAN_QOS_MAX }; #define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) struct ifla_vlan_qos_mapping { __u32 from; __u32 to; }; /* MACVLAN section */ enum { IFLA_MACVLAN_UNSPEC, IFLA_MACVLAN_MODE, 
IFLA_MACVLAN_FLAGS, IFLA_MACVLAN_MACADDR_MODE, IFLA_MACVLAN_MACADDR, IFLA_MACVLAN_MACADDR_DATA, IFLA_MACVLAN_MACADDR_COUNT, IFLA_MACVLAN_BC_QUEUE_LEN, IFLA_MACVLAN_BC_QUEUE_LEN_USED, __IFLA_MACVLAN_MAX, }; #define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) enum macvlan_mode { MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ }; enum macvlan_macaddr_mode { MACVLAN_MACADDR_ADD, MACVLAN_MACADDR_DEL, MACVLAN_MACADDR_FLUSH, MACVLAN_MACADDR_SET, }; #define MACVLAN_FLAG_NOPROMISC 1 /* VRF section */ enum { IFLA_VRF_UNSPEC, IFLA_VRF_TABLE, __IFLA_VRF_MAX }; #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) enum { IFLA_VRF_PORT_UNSPEC, IFLA_VRF_PORT_TABLE, __IFLA_VRF_PORT_MAX }; #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) /* MACSEC section */ enum { IFLA_MACSEC_UNSPEC, IFLA_MACSEC_SCI, IFLA_MACSEC_PORT, IFLA_MACSEC_ICV_LEN, IFLA_MACSEC_CIPHER_SUITE, IFLA_MACSEC_WINDOW, IFLA_MACSEC_ENCODING_SA, IFLA_MACSEC_ENCRYPT, IFLA_MACSEC_PROTECT, IFLA_MACSEC_INC_SCI, IFLA_MACSEC_ES, IFLA_MACSEC_SCB, IFLA_MACSEC_REPLAY_PROTECT, IFLA_MACSEC_VALIDATION, IFLA_MACSEC_PAD, IFLA_MACSEC_OFFLOAD, __IFLA_MACSEC_MAX, }; #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) /* XFRM section */ enum { IFLA_XFRM_UNSPEC, IFLA_XFRM_LINK, IFLA_XFRM_IF_ID, __IFLA_XFRM_MAX }; #define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, MACSEC_VALIDATE_STRICT = 2, __MACSEC_VALIDATE_END, MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, }; enum macsec_offload { MACSEC_OFFLOAD_OFF = 0, MACSEC_OFFLOAD_PHY = 1, MACSEC_OFFLOAD_MAC = 2, __MACSEC_OFFLOAD_END, MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, }; /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, IFLA_IPVLAN_MODE, 
IFLA_IPVLAN_FLAGS, __IFLA_IPVLAN_MAX }; #define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, IPVLAN_MODE_L3S, IPVLAN_MODE_MAX }; #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, IFLA_VXLAN_GROUP, /* group or remote address */ IFLA_VXLAN_LINK, IFLA_VXLAN_LOCAL, IFLA_VXLAN_TTL, IFLA_VXLAN_TOS, IFLA_VXLAN_LEARNING, IFLA_VXLAN_AGEING, IFLA_VXLAN_LIMIT, IFLA_VXLAN_PORT_RANGE, /* source port */ IFLA_VXLAN_PROXY, IFLA_VXLAN_RSC, IFLA_VXLAN_L2MISS, IFLA_VXLAN_L3MISS, IFLA_VXLAN_PORT, /* destination port */ IFLA_VXLAN_GROUP6, IFLA_VXLAN_LOCAL6, IFLA_VXLAN_UDP_CSUM, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, IFLA_VXLAN_REMCSUM_TX, IFLA_VXLAN_REMCSUM_RX, IFLA_VXLAN_GBP, IFLA_VXLAN_REMCSUM_NOPARTIAL, IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) struct ifla_vxlan_port_range { __be16 low; __be16 high; }; enum ifla_vxlan_df { VXLAN_DF_UNSET = 0, VXLAN_DF_SET, VXLAN_DF_INHERIT, __VXLAN_DF_END, VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, IFLA_GENEVE_ID, IFLA_GENEVE_REMOTE, IFLA_GENEVE_TTL, IFLA_GENEVE_TOS, IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, IFLA_GENEVE_REMOTE6, IFLA_GENEVE_UDP_CSUM, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) enum ifla_geneve_df { GENEVE_DF_UNSET = 0, GENEVE_DF_SET, GENEVE_DF_INHERIT, __GENEVE_DF_END, GENEVE_DF_MAX = __GENEVE_DF_END - 1, }; /* Bareudp section */ enum { IFLA_BAREUDP_UNSPEC, IFLA_BAREUDP_PORT, IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, __IFLA_BAREUDP_MAX }; #define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) /* PPP section */ enum { 
IFLA_PPP_UNSPEC, IFLA_PPP_DEV_FD, __IFLA_PPP_MAX }; #define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) /* GTP section */ enum ifla_gtp_role { GTP_ROLE_GGSN = 0, GTP_ROLE_SGSN, }; enum { IFLA_GTP_UNSPEC, IFLA_GTP_FD0, IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) /* Bonding section */ enum { IFLA_BOND_UNSPEC, IFLA_BOND_MODE, IFLA_BOND_ACTIVE_SLAVE, IFLA_BOND_MIIMON, IFLA_BOND_UPDELAY, IFLA_BOND_DOWNDELAY, IFLA_BOND_USE_CARRIER, IFLA_BOND_ARP_INTERVAL, IFLA_BOND_ARP_IP_TARGET, IFLA_BOND_ARP_VALIDATE, IFLA_BOND_ARP_ALL_TARGETS, IFLA_BOND_PRIMARY, IFLA_BOND_PRIMARY_RESELECT, IFLA_BOND_FAIL_OVER_MAC, IFLA_BOND_XMIT_HASH_POLICY, IFLA_BOND_RESEND_IGMP, IFLA_BOND_NUM_PEER_NOTIF, IFLA_BOND_ALL_SLAVES_ACTIVE, IFLA_BOND_MIN_LINKS, IFLA_BOND_LP_INTERVAL, IFLA_BOND_PACKETS_PER_SLAVE, IFLA_BOND_AD_LACP_RATE, IFLA_BOND_AD_SELECT, IFLA_BOND_AD_INFO, IFLA_BOND_AD_ACTOR_SYS_PRIO, IFLA_BOND_AD_USER_PORT_KEY, IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, __IFLA_BOND_MAX, }; #define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) enum { IFLA_BOND_AD_INFO_UNSPEC, IFLA_BOND_AD_INFO_AGGREGATOR, IFLA_BOND_AD_INFO_NUM_PORTS, IFLA_BOND_AD_INFO_ACTOR_KEY, IFLA_BOND_AD_INFO_PARTNER_KEY, IFLA_BOND_AD_INFO_PARTNER_MAC, __IFLA_BOND_AD_INFO_MAX, }; #define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) enum { IFLA_BOND_SLAVE_UNSPEC, IFLA_BOND_SLAVE_STATE, IFLA_BOND_SLAVE_MII_STATUS, IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, IFLA_BOND_SLAVE_PERM_HWADDR, IFLA_BOND_SLAVE_QUEUE_ID, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, __IFLA_BOND_SLAVE_MAX, }; #define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) /* SR-IOV virtual function management section */ enum { IFLA_VF_INFO_UNSPEC, IFLA_VF_INFO, __IFLA_VF_INFO_MAX, }; #define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) enum { IFLA_VF_UNSPEC, IFLA_VF_MAC, /* Hardware queue specific attributes */ 
IFLA_VF_VLAN, /* VLAN ID and QoS */ IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query * on/off switch */ IFLA_VF_STATS, /* network device statistics */ IFLA_VF_TRUST, /* Trust VF */ IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ IFLA_VF_BROADCAST, /* VF broadcast */ __IFLA_VF_MAX, }; #define IFLA_VF_MAX (__IFLA_VF_MAX - 1) struct ifla_vf_mac { __u32 vf; __u8 mac[32]; /* MAX_ADDR_LEN */ }; struct ifla_vf_broadcast { __u8 broadcast[32]; }; struct ifla_vf_vlan { __u32 vf; __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ __u32 qos; }; enum { IFLA_VF_VLAN_INFO_UNSPEC, IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ __IFLA_VF_VLAN_INFO_MAX, }; #define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) #define MAX_VLAN_LIST_LEN 1 struct ifla_vf_vlan_info { __u32 vf; __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ __u32 qos; __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ }; struct ifla_vf_tx_rate { __u32 vf; __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ }; struct ifla_vf_rate { __u32 vf; __u32 min_tx_rate; /* Min Bandwidth in Mbps */ __u32 max_tx_rate; /* Max Bandwidth in Mbps */ }; struct ifla_vf_spoofchk { __u32 vf; __u32 setting; }; struct ifla_vf_guid { __u32 vf; __u64 guid; }; enum { IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ IFLA_VF_LINK_STATE_ENABLE, /* link always up */ IFLA_VF_LINK_STATE_DISABLE, /* link always down */ __IFLA_VF_LINK_STATE_MAX, }; struct ifla_vf_link_state { __u32 vf; __u32 link_state; }; struct ifla_vf_rss_query_en { __u32 vf; __u32 setting; }; enum { IFLA_VF_STATS_RX_PACKETS, IFLA_VF_STATS_TX_PACKETS, IFLA_VF_STATS_RX_BYTES, 
IFLA_VF_STATS_TX_BYTES, IFLA_VF_STATS_BROADCAST, IFLA_VF_STATS_MULTICAST, IFLA_VF_STATS_PAD, IFLA_VF_STATS_RX_DROPPED, IFLA_VF_STATS_TX_DROPPED, __IFLA_VF_STATS_MAX, }; #define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) struct ifla_vf_trust { __u32 vf; __u32 setting; }; /* VF ports management section * * Nested layout of set/get msg is: * * [IFLA_NUM_VF] * [IFLA_VF_PORTS] * [IFLA_VF_PORT] * [IFLA_PORT_*], ... * [IFLA_VF_PORT] * [IFLA_PORT_*], ... * ... * [IFLA_PORT_SELF] * [IFLA_PORT_*], ... */ enum { IFLA_VF_PORT_UNSPEC, IFLA_VF_PORT, /* nest */ __IFLA_VF_PORT_MAX, }; #define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) enum { IFLA_PORT_UNSPEC, IFLA_PORT_VF, /* __u32 */ IFLA_PORT_PROFILE, /* string */ IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ IFLA_PORT_INSTANCE_UUID, /* binary UUID */ IFLA_PORT_HOST_UUID, /* binary UUID */ IFLA_PORT_REQUEST, /* __u8 */ IFLA_PORT_RESPONSE, /* __u16, output only */ __IFLA_PORT_MAX, }; #define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) #define PORT_PROFILE_MAX 40 #define PORT_UUID_MAX 16 #define PORT_SELF_VF -1 enum { PORT_REQUEST_PREASSOCIATE = 0, PORT_REQUEST_PREASSOCIATE_RR, PORT_REQUEST_ASSOCIATE, PORT_REQUEST_DISASSOCIATE, }; enum { PORT_VDP_RESPONSE_SUCCESS = 0, PORT_VDP_RESPONSE_INVALID_FORMAT, PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, PORT_VDP_RESPONSE_UNUSED_VTID, PORT_VDP_RESPONSE_VTID_VIOLATION, PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, PORT_VDP_RESPONSE_OUT_OF_SYNC, /* 0x08-0xFF reserved for future VDP use */ PORT_PROFILE_RESPONSE_SUCCESS = 0x100, PORT_PROFILE_RESPONSE_INPROGRESS, PORT_PROFILE_RESPONSE_INVALID, PORT_PROFILE_RESPONSE_BADSTATE, PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, PORT_PROFILE_RESPONSE_ERROR, }; struct ifla_port_vsi { __u8 vsi_mgr_id; __u8 vsi_type_id[3]; __u8 vsi_type_version; __u8 pad[3]; }; /* IPoIB section */ enum { IFLA_IPOIB_UNSPEC, IFLA_IPOIB_PKEY, IFLA_IPOIB_MODE, IFLA_IPOIB_UMCAST, __IFLA_IPOIB_MAX }; enum { IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ 
IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ }; #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) /* HSR/PRP section, both uses same interface */ /* Different redundancy protocols for hsr device */ enum { HSR_PROTOCOL_HSR, HSR_PROTOCOL_PRP, HSR_PROTOCOL_MAX, }; enum { IFLA_HSR_UNSPEC, IFLA_HSR_SLAVE1, IFLA_HSR_SLAVE2, IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ IFLA_HSR_SEQ_NR, IFLA_HSR_VERSION, /* HSR version */ IFLA_HSR_PROTOCOL, /* Indicate different protocol than * HSR. For example PRP. */ __IFLA_HSR_MAX, }; #define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) /* STATS section */ struct if_stats_msg { __u8 family; __u8 pad1; __u16 pad2; __u32 ifindex; __u32 filter_mask; }; /* A stats attribute can be netdev specific or a global stat. * For netdev stats, lets use the prefix IFLA_STATS_LINK_* */ enum { IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, IFLA_STATS_LINK_XSTATS_SLAVE, IFLA_STATS_LINK_OFFLOAD_XSTATS, IFLA_STATS_AF_SPEC, __IFLA_STATS_MAX, }; #define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) /* These are embedded into IFLA_STATS_LINK_XSTATS: * [IFLA_STATS_LINK_XSTATS] * -> [LINK_XSTATS_TYPE_xxx] * -> [rtnl link type specific attributes] */ enum { LINK_XSTATS_TYPE_UNSPEC, LINK_XSTATS_TYPE_BRIDGE, LINK_XSTATS_TYPE_BOND, __LINK_XSTATS_TYPE_MAX }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) /* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ enum { IFLA_OFFLOAD_XSTATS_UNSPEC, IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ __IFLA_OFFLOAD_XSTATS_MAX }; #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) #define XDP_FLAGS_SKB_MODE (1U << 1) #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) #define XDP_FLAGS_REPLACE (1U << 4) #define XDP_FLAGS_MODES 
(XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum { XDP_ATTACHED_NONE = 0, XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, XDP_ATTACHED_MULTI, }; enum { IFLA_XDP_UNSPEC, IFLA_XDP_FD, IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, IFLA_XDP_DRV_PROG_ID, IFLA_XDP_SKB_PROG_ID, IFLA_XDP_HW_PROG_ID, IFLA_XDP_EXPECTED_FD, __IFLA_XDP_MAX, }; #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) enum { IFLA_EVENT_NONE, IFLA_EVENT_REBOOT, /* internal reset / reboot */ IFLA_EVENT_FEATURES, /* change in offload features */ IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. arp/ndisc */ IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ }; /* tun section */ enum { IFLA_TUN_UNSPEC, IFLA_TUN_OWNER, IFLA_TUN_GROUP, IFLA_TUN_TYPE, IFLA_TUN_PI, IFLA_TUN_VNET_HDR, IFLA_TUN_PERSIST, IFLA_TUN_MULTI_QUEUE, IFLA_TUN_NUM_QUEUES, IFLA_TUN_NUM_DISABLED_QUEUES, __IFLA_TUN_MAX, }; #define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) /* rmnet section */ #define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) #define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) #define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) enum { IFLA_RMNET_UNSPEC, IFLA_RMNET_MUX_ID, IFLA_RMNET_FLAGS, __IFLA_RMNET_MAX, }; #define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) struct ifla_rmnet_flags { __u32 flags; __u32 mask; }; #endif /* _UAPI_LINUX_IF_LINK_H */ xdp-tools-1.6.1/headers/linux/if_xdp.h000066400000000000000000000070671514310632100176520ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * if_xdp: XDP socket user-space interface * Copyright(c) 2018 Intel Corporation. 
* * Author(s): Björn Töpel * Magnus Karlsson */ #ifndef _LINUX_IF_XDP_H #define _LINUX_IF_XDP_H #include /* Options for the sxdp_flags field */ #define XDP_SHARED_UMEM (1 << 0) #define XDP_COPY (1 << 1) /* Force copy-mode */ #define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ /* If this option is set, the driver might go sleep and in that case * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be * set. If it is set, the application need to explicitly wake up the * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are * running the driver and the application on the same core, you should * use this option so that the kernel will yield to the user space * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) /* By setting this option, userspace application indicates that it can * handle multiple descriptors per packet thus enabling AF_XDP to split * multi-buffer XDP frames into multiple Rx descriptors. Without this set * such frames will be dropped. */ #define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) struct sockaddr_xdp { __u16 sxdp_family; __u16 sxdp_flags; __u32 sxdp_ifindex; __u32 sxdp_queue_id; __u32 sxdp_shared_umem_fd; }; /* XDP_RING flags */ #define XDP_RING_NEED_WAKEUP (1 << 0) struct xdp_ring_offset { __u64 producer; __u64 consumer; __u64 desc; __u64 flags; }; struct xdp_mmap_offsets { struct xdp_ring_offset rx; struct xdp_ring_offset tx; struct xdp_ring_offset fr; /* Fill */ struct xdp_ring_offset cr; /* Completion */ }; /* XDP socket options */ #define XDP_MMAP_OFFSETS 1 #define XDP_RX_RING 2 #define XDP_TX_RING 3 #define XDP_UMEM_REG 4 #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; __u32 flags; __u32 tx_metadata_len; }; struct xdp_statistics { __u64 
rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 rx_ring_full; /* Dropped due to rx ring being full */ __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { __u32 flags; }; /* Flags for the flags field of struct xdp_options */ #define XDP_OPTIONS_ZEROCOPY (1 << 0) /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL /* Masks for unaligned chunks mode */ #define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; __u32 len; __u32 options; }; /* UMEM descriptor is __u64 */ /* Flag indicating that the packet continues with the buffer pointed out by the * next frame in the ring. The end of the packet is signalled by setting this * bit to zero. For single buffer packets, every descriptor has 'options' set * to 0 and this maintains backward compatibility. */ #define XDP_PKT_CONTD (1 << 0) #endif /* _LINUX_IF_XDP_H */ xdp-tools-1.6.1/headers/linux/jhash.h000066400000000000000000000106471514310632100174740ustar00rootroot00000000000000#ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* Copied from $(LINUX)/include/linux/jhash.h (kernel 4.18) */ /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * http://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. 
Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> ((-shift) & 31)); } /* copy paste of jhash from kernel sources (include/linux/jhash.h) to make sure * LLVM can compile it into valid sequence of BPF instructions */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } #define JHASH_INITVAL 0xdeadbeef typedef unsigned int u32; /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitray value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. 
*/ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const unsigned char *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += *(u32 *)(k); b += *(u32 *)(k + 4); c += *(u32 *)(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; /* fall through */ case 11: c += (u32)k[10]<<16; /* fall through */ case 10: c += (u32)k[9]<<8; /* fall through */ case 9: c += k[8]; /* fall through */ case 8: b += (u32)k[7]<<24; /* fall through */ case 7: b += (u32)k[6]<<16; /* fall through */ case 6: b += (u32)k[5]<<8; /* fall through */ case 5: b += k[4]; /* fall through */ case 4: a += (u32)k[3]<<24; /* fall through */ case 3: a += (u32)k[2]<<16; /* fall through */ case 2: a += (u32)k[1]<<8; /* fall through */ case 1: a += k[0]; __jhash_final(a, b, c); case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitray value * * Returns the hash value of the key. 
*/ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; /* fall through */ case 2: b += k[1]; /* fall through */ case 1: a += k[0]; __jhash_final(a, b, c); case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */ xdp-tools-1.6.1/headers/linux/list.h000066400000000000000000000047401514310632100173470ustar00rootroot00000000000000/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #ifndef __LINUX_LIST_H #define __LINUX_LIST_H struct list_head { struct list_head *next, *prev; }; #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) #define POISON_POINTER_DELTA 0 #define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; list->prev = list; } static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } /** * list_add - add a new entry * @new: 
new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; prev->next = next; } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void __list_del_entry(struct list_head *entry) { __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } static inline int list_empty(const struct list_head *head) { return head->next == head; } #define list_entry(ptr, type, member) \ container_of(ptr, type, member) #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ &pos->member != (head); \ pos = list_next_entry(pos, member)) #endif xdp-tools-1.6.1/headers/linux/netdev.h000066400000000000000000000033541514310632100176610ustar00rootroot00000000000000/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ #ifndef _UAPI_LINUX_NETDEV_H #define _UAPI_LINUX_NETDEV_H #define NETDEV_FAMILY_NAME "netdev" #define NETDEV_FAMILY_VERSION 1 /** * enum 
netdev_xdp_act * @NETDEV_XDP_ACT_BASIC: XDP feautues set supported by all drivers * (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX) * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements * ndo_xdp_xmit callback. * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP * in zero copy mode. * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw * offloading. * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear * XDP buffer support in the driver napi callback. * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements * non-linear XDP buffer support in ndo_xdp_xmit callback. */ enum netdev_xdp_act { NETDEV_XDP_ACT_BASIC = 1, NETDEV_XDP_ACT_REDIRECT = 2, NETDEV_XDP_ACT_NDO_XMIT = 4, NETDEV_XDP_ACT_XSK_ZEROCOPY = 8, NETDEV_XDP_ACT_HW_OFFLOAD = 16, NETDEV_XDP_ACT_RX_SG = 32, NETDEV_XDP_ACT_NDO_XMIT_SG = 64, NETDEV_XDP_ACT_MASK = 127, }; enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, NETDEV_CMD_DEV_DEL_NTF, NETDEV_CMD_DEV_CHANGE_NTF, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) }; #define NETDEV_MCGRP_MGMT "mgmt" #endif /* _UAPI_LINUX_NETDEV_H */ xdp-tools-1.6.1/headers/linux/netfilter.h000066400000000000000000000037571514310632100203770ustar00rootroot00000000000000#ifndef _LINUX_NETFILTER_H #define _LINUX_NETFILTER_H #include #include #include #include #include "hlist.h" struct flow_ports { __be16 source, dest; }; enum ip_conntrack_dir { IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY, IP_CT_DIR_MAX }; enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX, }; enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC, NF_FLOW_OFFLOAD_ROUTE, }; enum nf_flow_flags { NF_FLOW_SNAT, 
NF_FLOW_DNAT, NF_FLOW_TEARDOWN, NF_FLOW_HW, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, NF_FLOW_HW_PENDING, NF_FLOW_HW_BIDIRECTIONAL, NF_FLOW_HW_ESTABLISHED, }; enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_UNSPEC, FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 struct flow_offload_tuple { union { struct in_addr src_v4; struct in6_addr src_v6; }; union { struct in_addr dst_v4; struct in6_addr dst_v6; }; struct { __be16 src_port; __be16 dst_port; }; int iifidx; __u8 l3proto; __u8 l4proto; struct { __u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; __u8 dir:2, xmit_type:3, encap_num:2, in_vlan_ingress:2; __u16 mtu; union { struct { struct dst_entry *dst_cache; __u32 dst_cookie; }; struct { __u32 ifidx; __u32 hw_ifidx; __u8 h_source[ETH_ALEN]; __u8 h_dest[ETH_ALEN]; } out; struct { __u32 iifidx; } tc; }; }; struct flow_offload_tuple_rhash { struct rhash_head node; struct flow_offload_tuple tuple; }; struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; unsigned long flags; __u16 type; __u32 timeout; }; #endif /* _LINUX_NETFILTER_H */ xdp-tools-1.6.1/headers/linux/perf-sys.h000066400000000000000000000026521514310632100201440ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /* Copied from $(LINUX)/tools/perf/perf-sys.h (kernel 4.18) */ #ifndef _PERF_SYS_H #define _PERF_SYS_H #include #include #include #include #include /* * remove the following headers to allow for userspace program compilation * #include * #include */ #ifdef __powerpc__ #define CPUINFO_PROC {"cpu"} #endif #ifdef __s390__ #define CPUINFO_PROC {"vendor_id"} #endif #ifdef __sh__ #define CPUINFO_PROC {"cpu type"} #endif #ifdef __hppa__ #define CPUINFO_PROC {"cpu"} #endif #ifdef __sparc__ #define CPUINFO_PROC {"cpu"} #endif #ifdef __alpha__ #define CPUINFO_PROC 
{"cpu model"} #endif #ifdef __arm__ #define CPUINFO_PROC {"model name", "Processor"} #endif #ifdef __mips__ #define CPUINFO_PROC {"cpu model"} #endif #ifdef __arc__ #define CPUINFO_PROC {"Processor"} #endif #ifdef __xtensa__ #define CPUINFO_PROC {"core ID"} #endif #ifndef CPUINFO_PROC #define CPUINFO_PROC { "model name", } #endif static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { int fd; fd = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); #ifdef HAVE_ATTR_TEST if (unlikely(test_attr__enabled)) test_attr__open(attr, pid, cpu, fd, group_fd, flags); #endif return fd; } #endif /* _PERF_SYS_H */ xdp-tools-1.6.1/headers/xdp/000077500000000000000000000000001514310632100156525ustar00rootroot00000000000000xdp-tools-1.6.1/headers/xdp/libxdp.h000066400000000000000000000137261514310632100173160ustar00rootroot00000000000000// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * XDP management utility functions * * Copyright (C) 2020 Toke Høiland-Jørgensen */ #ifndef __LIBXDP_LIBXDP_H #define __LIBXDP_LIBXDP_H #include #include #include #include #include "xdp_helpers.h" #ifdef __cplusplus extern "C" { #endif #define XDP_BPFFS_ENVVAR "LIBXDP_BPFFS" #define XDP_BPFFS_MOUNT_ENVVAR "LIBXDP_BPFFS_AUTOMOUNT" #define XDP_OBJECT_ENVVAR "LIBXDP_OBJECT_PATH" #define XDP_ATTACH_DEVBIND (1 << 0) #define XDP_ATTACH_FLAGS (XDP_ATTACH_DEVBIND) enum xdp_attach_mode { XDP_MODE_UNSPEC = 0, XDP_MODE_NATIVE, XDP_MODE_SKB, XDP_MODE_HW }; /* This is compatible with libbpf logging levels */ enum libxdp_print_level { LIBXDP_WARN, LIBXDP_INFO, LIBXDP_DEBUG, }; typedef int (*libxdp_print_fn_t)(enum libxdp_print_level level, const char *, va_list ap); libxdp_print_fn_t libxdp_set_print(libxdp_print_fn_t fn); struct xdp_program; struct xdp_multiprog; long libxdp_get_error(const void *ptr); int libxdp_strerror(int err, char *buf, size_t size); int libxdp_clean_references(int ifindex); struct xdp_program 
*xdp_program__from_bpf_obj(struct bpf_object *obj, const char *section_name); struct xdp_program *xdp_program__find_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__open_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__from_fd(int fd); struct xdp_program *xdp_program__from_id(__u32 prog_id); struct xdp_program *xdp_program__from_pin(const char *pin_path); struct xdp_program *xdp_program__clone(struct xdp_program *xdp_prog, unsigned int flags); void xdp_program__close(struct xdp_program *xdp_prog); int xdp_program__test_run(struct xdp_program *xdp_prog, struct bpf_test_run_opts *opts, unsigned int flags); enum xdp_attach_mode xdp_program__is_attached(const struct xdp_program *xdp_prog, int ifindex); const char *xdp_program__name(const struct xdp_program *xdp_prog); const unsigned char *xdp_program__tag(const struct xdp_program *xdp_prog); struct bpf_object *xdp_program__bpf_obj(struct xdp_program *xdp_prog); const struct btf *xdp_program__btf(struct xdp_program *xdp_prog); uint32_t xdp_program__id(const struct xdp_program *xdp_prog); int xdp_program__fd(const struct xdp_program *xdp_prog); unsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog); int xdp_program__set_run_prio(struct xdp_program *xdp_prog, unsigned int run_prio); bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog, enum xdp_action action); int xdp_program__set_chain_call_enabled(struct xdp_program *prog, unsigned int action, bool enabled); int xdp_program__print_chain_call_actions(const struct xdp_program *prog, char *buf, size_t buf_len); bool xdp_program__xdp_frags_support(const struct xdp_program *prog); int xdp_program__set_xdp_frags_support(struct xdp_program *prog, bool frags); int xdp_program__pin(struct xdp_program *xdp_prog, const char *pin_path); int xdp_program__attach(struct xdp_program *xdp_prog, int ifindex, 
enum xdp_attach_mode mode, unsigned int flags); int xdp_program__attach_multi(struct xdp_program **progs, size_t num_progs, int ifindex, enum xdp_attach_mode mode, unsigned int flags); int xdp_program__detach(struct xdp_program *xdp_prog, int ifindex, enum xdp_attach_mode mode, unsigned int flags); int xdp_program__detach_multi(struct xdp_program **progs, size_t num_progs, int ifindex, enum xdp_attach_mode mode, unsigned int flags); struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex); struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, const struct xdp_multiprog *mp); void xdp_multiprog__close(struct xdp_multiprog *mp); int xdp_multiprog__detach(struct xdp_multiprog *mp); enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp); bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp); int xdp_multiprog__program_count(const struct xdp_multiprog *mp); bool xdp_multiprog__xdp_frags_support(const struct xdp_multiprog *mp); bool xdp_multiprog__xdp_dev_bound(const struct xdp_multiprog *mp); /* Only following members can be set at once: * * @obj, @prog_name * Create using BPF program with name @prog_name in BPF object @obj * * @prog_name is optional. In absence of @prog_name, first program of BPF * object is picked. * * @find_filename, @prog_name, @opts * Create using BPF program with name @prog_name in BPF object located in * LIBXDP_OBJECT_PATH with filename @find_filename, using * bpf_object_open_opts @opts. * * @prog_name and @opts is optional. In absence of @prog_name, first * program of BPF object is picked. * * @open_filename, @prog_name, @opts * Create using BPF program with name @prog_name in BPF object located at * path @open_filename, using bpf_object_open_opts @opts. * * @prog_name and @opts is optional. 
In absence of @prog_name, first * program of BPF object is picked. * * @id * Load from BPF program with ID @id * * @fd * Load from BPF program with fd @fd * * When one of these combinations is set, all other members of the opts struct * must be zeroed out. */ struct xdp_program_opts { size_t sz; struct bpf_object *obj; struct bpf_object_open_opts *opts; const char *prog_name; const char *find_filename; const char *open_filename; const char *pin_path; __u32 id; int fd; size_t :0; }; #define xdp_program_opts__last_field fd #define DECLARE_LIBXDP_OPTS DECLARE_LIBBPF_OPTS struct xdp_program *xdp_program__create(struct xdp_program_opts *opts); #ifdef __cplusplus } /* extern "C" */ #endif #endif xdp-tools-1.6.1/headers/xdp/parsing_helpers.h000066400000000000000000000160431514310632100212140ustar00rootroot00000000000000/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ /* * This file contains parsing functions that can be used in eXDP programs. The * functions are marked as __always_inline, and fully defined in this header * file to be included in the BPF program. * * Each helper parses a packet header, including doing bounds checking, and * returns the type of its contents if successful, and -1 otherwise. * * For Ethernet and IP headers, the content type is the type of the payload * (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field. * All return values are in host byte order. */ #ifndef __PARSING_HELPERS_H #define __PARSING_HELPERS_H #include #include #include #include #include #include #include #include #include #include /* Header cursor to keep track of current parsing position */ struct hdr_cursor { void *pos; }; /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /* * Struct icmphdr_common represents the common part of the icmphdr and icmp6hdr * structures. 
*/ struct icmphdr_common { __u8 type; __u8 code; __sum16 cksum; }; #define ARPHRD_ETHER 1 /* Ethernet 10Mbps */ #define ARPOP_REQUEST 1 /* ARP request */ #define ARPOP_REPLY 2 /* ARP reply */ struct arphdr { __be16 ar_hrd; /* format of hardware address */ __be16 ar_pro; /* format of protocol address */ unsigned char ar_hln; /* length of hardware address */ unsigned char ar_pln; /* length of protocol address */ __be16 ar_op; /* ARP opcode (command) */ /* * Ethernet looks like this : This bit is variable sized however... */ unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ __be32 ar_sip; /* sender IP address */ unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ __be32 ar_tip; /* target IP address */ } __attribute__((packed)); /* Allow users of header file to redefine VLAN max depth */ #ifndef VLAN_MAX_DEPTH #define VLAN_MAX_DEPTH 4 #endif /* Longest chain of IPv6 extension headers to resolve */ #ifndef IPV6_EXT_MAX_CHAIN #define IPV6_EXT_MAX_CHAIN 6 #endif static __always_inline int proto_is_vlan(__u16 h_proto) { return !!(h_proto == bpf_htons(ETH_P_8021Q) || h_proto == bpf_htons(ETH_P_8021AD)); } /* Notice, parse_ethhdr() will skip VLAN tags, by advancing nh->pos and returns * next header EtherType, BUT the ethhdr pointer supplied still points to the * Ethernet header. Thus, caller can look at eth->h_proto to see if this was a * VLAN tagged packet. */ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, struct ethhdr **ethhdr) { struct ethhdr *eth = nh->pos; struct vlan_hdr *vlh; __u16 h_proto; int i; if (eth + 1 > data_end) return -1; nh->pos = eth + 1; *ethhdr = eth; vlh = nh->pos; h_proto = eth->h_proto; /* Use loop unrolling to avoid the verifier restriction on loops; * support up to VLAN_MAX_DEPTH layers of VLAN encapsulation. 
*/ #pragma unroll for (i = 0; i < VLAN_MAX_DEPTH; i++) { if (!proto_is_vlan(h_proto)) break; if (vlh + 1 > data_end) break; h_proto = vlh->h_vlan_encapsulated_proto; vlh++; } nh->pos = vlh; return h_proto; /* network-byte-order */ } static __always_inline int skip_ip6hdrext(struct hdr_cursor *nh, void *data_end, __u8 next_hdr_type) { for (int i = 0; i < IPV6_EXT_MAX_CHAIN; ++i) { struct ipv6_opt_hdr *hdr = nh->pos; if (hdr + 1 > data_end) return -1; switch (next_hdr_type) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_MH: nh->pos = (char *)hdr + (hdr->hdrlen + 1) * 8; next_hdr_type = hdr->nexthdr; break; case IPPROTO_AH: nh->pos = (char *)hdr + (hdr->hdrlen + 2) * 4; next_hdr_type = hdr->nexthdr; break; case IPPROTO_FRAGMENT: nh->pos = (char *)hdr + 8; next_hdr_type = hdr->nexthdr; break; default: /* Found a header that is not an IPv6 extension header */ return next_hdr_type; } } return -1; } static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, void *data_end, struct ipv6hdr **ip6hdr) { struct ipv6hdr *ip6h = nh->pos; /* Pointer-arithmetic bounds check; pointer +1 points to after end of * thing being pointed to. We will be using this style in the remainder * of the tutorial. 
*/ if (ip6h + 1 > data_end) return -1; nh->pos = ip6h + 1; *ip6hdr = ip6h; return skip_ip6hdrext(nh, data_end, ip6h->nexthdr); } static __always_inline int parse_iphdr(struct hdr_cursor *nh, void *data_end, struct iphdr **iphdr) { struct iphdr *iph = nh->pos; int hdrsize; if (iph + 1 > data_end) return -1; hdrsize = iph->ihl * 4; /* Variable-length IPv4 header, need to use byte-based arithmetic */ if (nh->pos + hdrsize > data_end) return -1; nh->pos += hdrsize; *iphdr = iph; return iph->protocol; } static __always_inline int parse_arphdr(struct hdr_cursor *nh, void *data_end, struct arphdr **arp_hdr) { struct arphdr *arp = nh->pos; if (arp + 1 > data_end) return -1; if (arp->ar_hrd != bpf_htons(ARPHRD_ETHER) || arp->ar_pro != bpf_htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4) return -1; nh->pos = (void *)(arp + 1); *arp_hdr = arp; return arp->ar_op; } static __always_inline int parse_icmp6hdr(struct hdr_cursor *nh, void *data_end, struct icmp6hdr **icmp6hdr) { struct icmp6hdr *icmp6h = nh->pos; if (icmp6h + 1 > data_end) return -1; nh->pos = icmp6h + 1; *icmp6hdr = icmp6h; return icmp6h->icmp6_type; } static __always_inline int parse_icmphdr(struct hdr_cursor *nh, void *data_end, struct icmphdr **icmphdr) { struct icmphdr *icmph = nh->pos; if (icmph + 1 > data_end) return -1; nh->pos = icmph + 1; *icmphdr = icmph; return icmph->type; } static __always_inline int parse_icmphdr_common(struct hdr_cursor *nh, void *data_end, struct icmphdr_common **icmphdr) { struct icmphdr_common *h = nh->pos; if (h + 1 > data_end) return -1; nh->pos = h + 1; *icmphdr = h; return h->type; } /* * parse_udphdr: parse the udp header and return the length of the udp payload */ static __always_inline int parse_udphdr(struct hdr_cursor *nh, void *data_end, struct udphdr **udphdr) { int len; struct udphdr *h = nh->pos; if (h + 1 > data_end) return -1; nh->pos = h + 1; *udphdr = h; len = bpf_ntohs(h->len) - sizeof(struct udphdr); if (len < 0) return -1; return len; } /* * 
parse_tcphdr: parse and return the length of the tcp header */ static __always_inline int parse_tcphdr(struct hdr_cursor *nh, void *data_end, struct tcphdr **tcphdr) { int len; struct tcphdr *h = nh->pos; if (h + 1 > data_end) return -1; len = h->doff * 4; if ((void *) h + len > data_end) return -1; nh->pos = h + 1; *tcphdr = h; return len; } #endif /* __PARSING_HELPERS_H */ xdp-tools-1.6.1/headers/xdp/prog_dispatcher.h000066400000000000000000000023411514310632100212000ustar00rootroot00000000000000/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ #ifndef __PROG_DISPATCHER_H #define __PROG_DISPATCHER_H #include #define XDP_METADATA_SECTION "xdp_metadata" #define XDP_DISPATCHER_VERSION 3 /* magic byte is 'X' + 'D' + 'P' (88+68+80=236) */ #define XDP_DISPATCHER_MAGIC 236 /* default retval for dispatcher corresponds to the highest bit in the * chain_call_actions bitmap; we use this to make sure the dispatcher always * continues the calls chain if a function does not have an freplace program * attached. 
*/ #define XDP_DISPATCHER_RETVAL 31 #ifndef MAX_DISPATCHER_ACTIONS #define MAX_DISPATCHER_ACTIONS 10 #endif struct xdp_dispatcher_config { __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ __u8 num_progs_enabled; /* Number of active program slots */ __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; __u32 program_flags[MAX_DISPATCHER_ACTIONS]; __u8 is_xdp_devbound; /* Whether this dispatcher is bounded to a device */ }; #endif xdp-tools-1.6.1/headers/xdp/xdp_helpers.h000066400000000000000000000004541514310632100203430ustar00rootroot00000000000000/* SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-clause) */ #ifndef __XDP_HELPERS_H #define __XDP_HELPERS_H #define _CONCAT(x,y) x ## y #define XDP_RUN_CONFIG(f) _CONCAT(_,f) SEC(".xdp_run_config") #define XDP_DEFAULT_RUN_PRIO 50 #define XDP_DEFAULT_CHAIN_CALL_ACTIONS (1< #include #include #include #include #include "xdp_sample_shared.h" #define ETH_ALEN 6 #define ETH_P_802_3_MIN 0x0600 #define ETH_P_8021Q 0x8100 #define ETH_P_8021AD 0x88A8 #define ETH_P_IP 0x0800 #define ETH_P_IPV6 0x86DD #define ETH_P_ARP 0x0806 #define IPPROTO_ICMPV6 58 #define EINVAL 22 #define ENETDOWN 100 #define EMSGSIZE 90 #define EOPNOTSUPP 95 #define ENOSPC 28 typedef struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(map_flags, BPF_F_MMAPABLE); __type(key, unsigned int); __type(value, struct datarec); } array_map; extern array_map rx_cnt; extern const volatile int nr_cpus; enum { XDP_REDIRECT_SUCCESS = 0, XDP_REDIRECT_ERROR = 1 }; static __always_inline void swap_src_dst_mac(void *data) { unsigned short *p = data; unsigned short dst[3]; dst[0] = p[0]; dst[1] = p[1]; dst[2] = p[2]; p[0] = p[3]; p[1] = p[4]; p[2] = p[5]; p[3] = dst[0]; p[4] = dst[1]; p[5] = dst[2]; } /* * Note: including linux/compiler.h or linux/kernel.h for the macros below * conflicts with 
vmlinux.h include in BPF files, so we define them here. * * Following functions are taken from kernel sources and * break aliasing rules in their original form. * * While kernel is compiled with -fno-strict-aliasing, * perf uses -Wstrict-aliasing=3 which makes build fail * under gcc 4.4. * * Using extra __may_alias__ type to allow aliasing * in this case. */ typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; static __always_inline void __read_once_size(const volatile void *p, void *res, int size) { switch (size) { case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; default: asm volatile ("" : : : "memory"); __builtin_memcpy((void *)res, (const void *)p, size); asm volatile ("" : : : "memory"); } } static __always_inline void __write_once_size(volatile void *p, void *res, int size) { switch (size) { case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; default: asm volatile ("" : : : "memory"); __builtin_memcpy((void *)p, (const void *)res, size); asm volatile ("" : : : "memory"); } } #define READ_ONCE(x) \ ({ \ union { typeof(x) __val; char __c[1]; } __u = \ { .__c = { 0 } }; \ __read_once_size(&(x), __u.__c, sizeof(x)); \ __u.__val; \ }) #define WRITE_ONCE(x, val) \ ({ \ union { typeof(x) __val; char __c[1]; } __u = \ { .__val = (val) }; \ __write_once_size(&(x), __u.__c, sizeof(x)); \ __u.__val; \ }) /* Add a value using relaxed 
read and relaxed write. Less expensive than * fetch_add when there is no write concurrency. */ #define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val)) #define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1) #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif xdp-tools-1.6.1/headers/xdp/xdp_sample_common.bpf.h000066400000000000000000000164661514310632100223120ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ #ifndef _XDP_SAMPLE_COMMON_BPF_H #define _XDP_SAMPLE_COMMON_BPF_H #include "xdp_sample.bpf.h" #include #include #include #include #include array_map rx_cnt SEC(".maps"); array_map redir_err_cnt SEC(".maps"); array_map cpumap_enqueue_cnt SEC(".maps"); array_map cpumap_kthread_cnt SEC(".maps"); array_map exception_cnt SEC(".maps"); array_map devmap_xmit_cnt SEC(".maps"); array_map rxq_cnt SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, 32 * 32); __type(key, __u64); __type(value, struct datarec); } devmap_xmit_cnt_multi SEC(".maps"); const volatile int nr_cpus = 0; /* These can be set before loading so that redundant comparisons can be DCE'd by * the verifier, and only actual matches are tried after loading tp_btf program. * This allows sample to filter tracepoint stats based on net_device. 
*/ const volatile int from_match[32] = {}; const volatile int to_match[32] = {}; int cpumap_map_id = 0; /* Find if b is part of set a, but if a is empty set then evaluate to true */ #define IN_SET(a, b) \ ({ \ bool __res = !(a)[0]; \ for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \ __res = (a)[i] == (b); \ if (__res) \ break; \ } \ __res; \ }) static __always_inline __u32 xdp_get_err_key(int err) { switch (err) { case 0: return 0; case -EINVAL: return 2; case -ENETDOWN: return 3; case -EMSGSIZE: return 4; case -EOPNOTSUPP: return 5; case -ENOSPC: return 6; default: return 1; } } static __always_inline int xdp_redirect_collect_stat(int from, int err) { __u32 cpu = bpf_get_smp_processor_id(); __u32 key = XDP_REDIRECT_ERROR; struct datarec *rec; __u32 idx; if (!IN_SET(from_match, from)) return 0; key = xdp_get_err_key(err); idx = key * nr_cpus + cpu; rec = bpf_map_lookup_elem(&redir_err_cnt, &idx); if (!rec) return 0; if (key) NO_TEAR_INC(rec->dropped); else NO_TEAR_INC(rec->processed); return 0; /* Indicate event was filtered (no further processing)*/ /* * Returning 1 here would allow e.g. a perf-record tracepoint * to see and record these events, but it doesn't work well * in-practice as stopping perf-record also unload this * bpf_prog. Plus, there is additional overhead of doing so. 
*/ } SEC("tp_btf/xdp_redirect_err") int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, __u32 index) { return xdp_redirect_collect_stat(dev->ifindex, err); } SEC("tp_btf/xdp_redirect") int BPF_PROG(tp_xdp_redirect, const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, __u32 index) { return xdp_redirect_collect_stat(dev->ifindex, err); } SEC("tp_btf/xdp_cpumap_enqueue") int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed, unsigned int drops, int to_cpu) { __u32 cpu = bpf_get_smp_processor_id(); struct datarec *rec; __u32 idx; if (cpumap_map_id && cpumap_map_id != map_id) return 0; idx = to_cpu * nr_cpus + cpu; rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx); if (!rec) return 0; NO_TEAR_ADD(rec->processed, processed); NO_TEAR_ADD(rec->dropped, drops); /* Record bulk events, then userspace can calc average bulk size */ if (processed > 0) NO_TEAR_INC(rec->issue); /* Inception: It's possible to detect overload situations, via * this tracepoint. This can be used for creating a feedback * loop to XDP, which can take appropriate actions to mitigate * this overload situation. 
*/ return 0; } SEC("tp_btf/xdp_cpumap_kthread") int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats) { struct datarec *rec; __u32 cpu; if (cpumap_map_id && cpumap_map_id != map_id) return 0; cpu = bpf_get_smp_processor_id(); rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu); if (!rec) return 0; NO_TEAR_ADD(rec->processed, processed); NO_TEAR_ADD(rec->dropped, drops); NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass); NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop); NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect); /* Count times kthread yielded CPU via schedule call */ if (sched) NO_TEAR_INC(rec->issue); return 0; } SEC("tp_btf/xdp_cpumap_kthread") int BPF_PROG(tp_xdp_cpumap_compat, int map_id, unsigned int processed, unsigned int drops, int sched) { struct datarec *rec; __u32 cpu; if (cpumap_map_id && cpumap_map_id != map_id) return 0; cpu = bpf_get_smp_processor_id(); rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu); if (!rec) return 0; NO_TEAR_ADD(rec->processed, processed); NO_TEAR_ADD(rec->dropped, drops); /* Count times kthread yielded CPU via schedule call */ if (sched) NO_TEAR_INC(rec->issue); return 0; } SEC("tp_btf/xdp_exception") int BPF_PROG(tp_xdp_exception, const struct net_device *dev, const struct bpf_prog *xdp, __u32 act) { __u32 cpu = bpf_get_smp_processor_id(); struct datarec *rec; __u32 key = act, idx; if (!IN_SET(from_match, dev->ifindex)) return 0; if (!IN_SET(to_match, dev->ifindex)) return 0; if (key > XDP_REDIRECT) key = XDP_REDIRECT + 1; idx = key * nr_cpus + cpu; rec = bpf_map_lookup_elem(&exception_cnt, &idx); if (!rec) return 0; NO_TEAR_INC(rec->dropped); return 0; } SEC("tp_btf/xdp_devmap_xmit") int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err) { struct datarec *rec; int idx_in, idx_out; __u32 cpu; idx_in = from_dev->ifindex; idx_out = to_dev->ifindex; if 
(!IN_SET(from_match, idx_in)) return 0; if (!IN_SET(to_match, idx_out)) return 0; cpu = bpf_get_smp_processor_id(); rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu); if (!rec) return 0; NO_TEAR_ADD(rec->processed, sent); NO_TEAR_ADD(rec->dropped, drops); /* Record bulk events, then userspace can calc average bulk size */ NO_TEAR_INC(rec->info); /* Record error cases, where no frame were sent */ /* Catch API error of drv ndo_xdp_xmit sent more than count */ if (err || drops < 0) NO_TEAR_INC(rec->issue); return 0; } SEC("tp_btf/xdp_devmap_xmit") int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err) { struct datarec empty = {}; struct datarec *rec; int idx_in, idx_out; __u64 idx; idx_in = from_dev->ifindex; idx_out = to_dev->ifindex; idx = idx_in; idx = idx << 32 | idx_out; if (!IN_SET(from_match, idx_in)) return 0; if (!IN_SET(to_match, idx_out)) return 0; bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST); rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx); if (!rec) return 0; NO_TEAR_ADD(rec->processed, sent); NO_TEAR_ADD(rec->dropped, drops); NO_TEAR_INC(rec->info); if (err || drops < 0) NO_TEAR_INC(rec->issue); return 0; } #endif xdp-tools-1.6.1/headers/xdp/xdp_sample_shared.h000066400000000000000000000005731514310632100215120ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only #ifndef _XDP_SAMPLE_SHARED_H #define _XDP_SAMPLE_SHARED_H struct datarec { unsigned long long processed; unsigned long long dropped; unsigned long long issue; union { unsigned long long xdp_pass; unsigned long long info; }; unsigned long long xdp_drop; unsigned long long xdp_redirect; } __attribute__((aligned(64))); #endif xdp-tools-1.6.1/headers/xdp/xdp_stats_kern.h000066400000000000000000000026131514310632100210550ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /* Used *ONLY* by BPF-prog running kernel side. 
*/ #ifndef __XDP_STATS_KERN_H #define __XDP_STATS_KERN_H /* Data record type 'struct datarec' is defined in common/xdp_stats_kern_user.h, * programs using this header must first include that file. */ #ifndef __XDP_STATS_KERN_USER_H #warning "You forgot to #include <../common/xdp_stats_kern_user.h>" #include <../common/xdp_stats_kern_user.h> #endif #ifndef XDP_STATS_MAP_PINNING #define XDP_STATS_MAP_PINNING LIBBPF_PIN_BY_NAME #endif /* Keeps stats per (enum) xdp_action */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, XDP_ACTION_MAX); __type(key, __u32); __type(value, struct xdp_stats_record); __uint(pinning, LIBBPF_PIN_BY_NAME); } XDP_STATS_MAP_NAME SEC(".maps"); static __always_inline __u32 xdp_stats_record_action(struct xdp_md *ctx, __u32 action) { if (action >= XDP_ACTION_MAX) return XDP_ABORTED; /* Lookup in kernel BPF-side return pointer to actual data record */ struct xdp_stats_record *rec = bpf_map_lookup_elem(&xdp_stats_map, &action); if (!rec) return XDP_ABORTED; /* BPF_MAP_TYPE_PERCPU_ARRAY returns a data record specific to current * CPU and XDP hooks runs under Softirq, which makes it safe to update * without atomic operations. */ rec->rx_packets++; rec->rx_bytes += (ctx->data_end - ctx->data); return action; } #endif /* __XDP_STATS_KERN_H */ xdp-tools-1.6.1/headers/xdp/xdp_stats_kern_user.h000066400000000000000000000010511514310632100221060ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /* Used by BPF-prog kernel side BPF-progs and userspace programs, * for sharing xdp_stats common struct and DEFINEs. 
*/ #ifndef __XDP_STATS_KERN_USER_H #define __XDP_STATS_KERN_USER_H /* This is the data record stored in the map */ struct xdp_stats_record { union { __u64 packets; __u64 rx_packets; }; union { __u64 bytes; __u64 rx_bytes; }; }; #ifndef XDP_ACTION_MAX #define XDP_ACTION_MAX (XDP_REDIRECT + 1) #endif #define XDP_STATS_MAP_NAME xdp_stats_map #endif /* __XDP_STATS_KERN_USER_H */ xdp-tools-1.6.1/headers/xdp/xsk.h000066400000000000000000000222761514310632100166410ustar00rootroot00000000000000/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* * AF_XDP user-space access library. * * Copyright(c) 2018 - 2021 Intel Corporation. * * Author(s): Magnus Karlsson */ /* So as not to clash with these functions when they where part of libbpf */ #ifndef __LIBBPF_XSK_H #define __LIBBPF_XSK_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef __GNUC_STDC_INLINE__ #define XDP_ALWAYS_INLINE inline __attribute__((__always_inline__)) #elif __GNUC_GNU_INLINE__ #define XDP_ALWAYS_INLINE static inline __attribute__((__always_inline__)) #else #define XDP_ALWAYS_INLINE static inline #endif /* Do not access these members directly. Use the functions below. */ #define DEFINE_XSK_RING(name) \ struct name { \ __u32 cached_prod; \ __u32 cached_cons; \ __u32 mask; \ __u32 size; \ __u32 *producer; \ __u32 *consumer; \ void *ring; \ __u32 *flags; \ } DEFINE_XSK_RING(xsk_ring_prod); DEFINE_XSK_RING(xsk_ring_cons); /* For a detailed explanation on the memory barriers associated with the * ring, please take a look at net/xdp/xsk_queue.h in the Linux kernel source tree. 
*/ struct xsk_umem; struct xsk_socket; XDP_ALWAYS_INLINE __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx) { __u64 *addrs = (__u64 *)fill->ring; return &addrs[idx & fill->mask]; } XDP_ALWAYS_INLINE const __u64 * xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx) { const __u64 *addrs = (const __u64 *)comp->ring; return &addrs[idx & comp->mask]; } XDP_ALWAYS_INLINE struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx) { struct xdp_desc *descs = (struct xdp_desc *)tx->ring; return &descs[idx & tx->mask]; } XDP_ALWAYS_INLINE const struct xdp_desc * xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx) { const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring; return &descs[idx & rx->mask]; } XDP_ALWAYS_INLINE int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r) { return *r->flags & XDP_RING_NEED_WAKEUP; } XDP_ALWAYS_INLINE __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) { __u32 free_entries = r->cached_cons - r->cached_prod; if (free_entries >= nb) return free_entries; /* Refresh the local tail pointer. * cached_cons is r->size bigger than the real consumer pointer so * that this addition can be avoided in the more frequently * executed code that computs free_entries in the beginning of * this function. Without this optimization it whould have been * free_entries = r->cached_cons - r->cached_prod + r->size */ r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE); r->cached_cons += r->size; return r->cached_cons - r->cached_prod; } XDP_ALWAYS_INLINE __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) { __u32 entries = r->cached_prod - r->cached_cons; if (entries == 0) { r->cached_prod = __atomic_load_n(r->producer, __ATOMIC_ACQUIRE); entries = r->cached_prod - r->cached_cons; } return (entries > nb) ? 
nb : entries; } XDP_ALWAYS_INLINE __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) { if (xsk_prod_nb_free(prod, nb) < nb) return 0; *idx = prod->cached_prod; prod->cached_prod += nb; return nb; } XDP_ALWAYS_INLINE void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) { /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. */ __atomic_store_n(prod->producer, *prod->producer + nb, __ATOMIC_RELEASE); } XDP_ALWAYS_INLINE __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) { __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { *idx = cons->cached_cons; cons->cached_cons += entries; } return entries; } XDP_ALWAYS_INLINE void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) { cons->cached_cons -= nb; } XDP_ALWAYS_INLINE void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) { /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. 
*/ __atomic_store_n(cons->consumer, *cons->consumer + nb, __ATOMIC_RELEASE); } XDP_ALWAYS_INLINE void *xsk_umem__get_data(void *umem_area, __u64 addr) { return &((char *)umem_area)[addr]; } XDP_ALWAYS_INLINE __u64 xsk_umem__extract_addr(__u64 addr) { return addr & XSK_UNALIGNED_BUF_ADDR_MASK; } XDP_ALWAYS_INLINE __u64 xsk_umem__extract_offset(__u64 addr) { return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; } XDP_ALWAYS_INLINE __u64 xsk_umem__add_offset_to_addr(__u64 addr) { return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr); } int xsk_umem__fd(const struct xsk_umem *umem); int xsk_socket__fd(const struct xsk_socket *xsk); #define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 #define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048 #define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */ #define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT) #define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0 #define XSK_UMEM__DEFAULT_FLAGS 0 #define XSK_UMEM__DEFAULT_TX_METADATA_LEN 0 struct xsk_umem_config { __u32 fill_size; __u32 comp_size; __u32 frame_size; __u32 frame_headroom; __u32 flags; }; /* The following fields are optional: * * @fd, @size, @fill_size, @comp_size, @frame_size, @frame_headroom, * @flags, @tx_metadata_len * If @fd is unset, a new sockfd will be created. * If @size is unset, @umem_area must be page-aligned. * If the remaining fields are unset, they will be set to * default value (see `xsk_set_umem_config()`). * * Except for the fields mentioned above, no field can be set. */ struct xsk_umem_opts { size_t sz; int fd; __u64 size; __u32 fill_size; __u32 comp_size; __u32 frame_size; __u32 frame_headroom; __u32 flags; __u32 tx_metadata_len; size_t :0; }; #define xsk_umem_opts__last_field tx_metadata_len int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); /* Flags for the libbpf_flags field. * We still call this field libbpf_flags for compatibility reasons. 
*/ #define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) #define XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD (1 << 0) struct xsk_socket_config { __u32 rx_size; __u32 tx_size; union { __u32 libbpf_flags; __u32 libxdp_flags; }; __u32 xdp_flags; __u16 bind_flags; }; /* * The following fields should not be NULL at the same time: * * @rx, @tx * At least one traffic direction should be assigned for an xsk. * * The following fields are optional: * * @fill, @comp, @rx_size, @tx_size, @libxdp_flags, @xdp_flags, * @bind_flags * If @fill and @comp are both unset, they will be set to umem's * fill_save and comp_save respectively. Note that it is invalid * to set only one of them. * If the remaining fields are unset, they will be set to * default value (see `xsk_set_xdp_socket_config()`). * * Except for the fields mentioned above, no field can be set. */ struct xsk_socket_opts { size_t sz; struct xsk_ring_cons *rx; struct xsk_ring_prod *tx; struct xsk_ring_prod *fill; struct xsk_ring_cons *comp; __u32 rx_size; __u32 tx_size; __u32 libxdp_flags; __u32 xdp_flags; __u16 bind_flags; size_t :0; }; #define xsk_socket_opts__last_field bind_flags /* Set config to NULL to get the default configuration. */ int xsk_umem__create(struct xsk_umem **umem, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); int xsk_umem__create_with_fd(struct xsk_umem **umem, int fd, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); /* Newer version to create umem by opts, recommended to use. 
*/ struct xsk_umem *xsk_umem__create_opts(void *umem_area, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, struct xsk_umem_opts *opts); int xsk_socket__create(struct xsk_socket **xsk, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *config); int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_socket_config *config); /* Newer version to create xsk by opts, recommended to use. */ struct xsk_socket *xsk_socket__create_opts(const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_socket_opts *opts); /* Returns 0 for success and -EBUSY if the umem is still in use. */ int xsk_umem__delete(struct xsk_umem *umem); void xsk_socket__delete(struct xsk_socket *xsk); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* __LIBBPF_XSK_H */ /* For new functions post libbpf */ #ifndef __LIBXDP_XSK_H #define __LIBXDP_XSK_H #ifdef __cplusplus extern "C" { #endif #ifdef __cplusplus } /* extern "C" */ #endif #endif /* __LIBXDP_XSK_H */ xdp-tools-1.6.1/lib/000077500000000000000000000000001514310632100142125ustar00rootroot00000000000000xdp-tools-1.6.1/lib/Makefile000066400000000000000000000022201514310632100156460ustar00rootroot00000000000000 LIBBPF_CFLAGS:=$(if $(CFLAGS),$(CFLAGS),-g -O2 -Werror -Wall) -fPIC LIB_DIR = . 
include defines.mk SUBDIRS=util testing .PHONY: $(SUBDIRS) all: $(SUBDIRS) libxdp util: libxdp @echo; echo " $@"; $(MAKE) -C $@ testing: libxdp util @echo; echo " $@"; $(MAKE) -C $@ .PHONY: libxdp libxdp: $(OBJECT_LIBBPF) @echo; echo " $@"; $(MAKE) -C $@ .PHONY: clean clean: libbpf_clean @for i in $(SUBDIRS) libxdp; \ do $(MAKE) -C $$i clean; done .PHONY: install install: libxdp_install $(MAKE) -C testing install .PHONY: libxdp_install libxdp_install: libxdp install -m 0755 -d $(DESTDIR)$(HDRDIR) $(MAKE) -C libxdp install libbpf: $(OBJECT_LIBBPF) # Handle libbpf as git submodule ifeq ($(SYSTEM_LIBBPF),n) ifeq ($(VERBOSE),0) P:= >/dev/null endif # Detect submodule libbpf source file changes LIBBPF_SOURCES := $(wildcard libbpf/src/*.[ch]) .PHONY: libbpf_clean libbpf/src/libbpf.a: $(LIBBPF_SOURCES) @echo ; echo " libbpf" $(QUIET_CC)$(MAKE) -C libbpf/src CFLAGS="$(LIBBPF_CFLAGS)" $P $(QUIET_INSTALL)$(MAKE) -C libbpf/src install_headers DESTDIR=root PREFIX=/ $P libbpf_clean: $(Q)$(MAKE) -C libbpf/src clean $P else libbpf_clean: @echo -n endif xdp-tools-1.6.1/lib/README.org000066400000000000000000000003161514310632100156600ustar00rootroot00000000000000* Library files This directory contains common Makefile definitions, and common code used by the different utilities. The libbpf subdir is a git submodule linking to the upstream libbpf github repository. 
xdp-tools-1.6.1/lib/common.mk000066400000000000000000000104741514310632100160410ustar00rootroot00000000000000# Common Makefile parts for BPF-building with libbpf # -------------------------------------------------- # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) # # This file should be included from your Makefile like: # LIB_DIR = ../lib/ # include $(LIB_DIR)/common.mk # # It is expected that you define the variables: # XDP_TARGETS and USER_TARGETS # as a space-separated list # XDP_C = ${XDP_TARGETS:=.c} XDP_OBJ = ${XDP_C:.c=.o} BPF_SKEL_OBJ = ${BPF_SKEL_TARGETS:=.o} BPF_SKEL_H = ${BPF_SKEL_OBJ:.bpf.o=.skel.h} USER_C := ${USER_TARGETS:=.c} USER_OBJ := ${USER_C:.c=.o} TEST_C := ${TEST_TARGETS:=.c} TEST_OBJ := ${TEST_C:.c=.o} XDP_OBJ_INSTALL ?= $(XDP_OBJ) MAN_FILES := $(MAN_PAGE) # Expect this is defined by including Makefile, but define if not LIB_DIR ?= ../lib LDLIBS ?= $(USER_LIBS) LDLIBS += -lm -lpthread include $(LIB_DIR)/defines.mk include $(LIB_DIR)/libxdp/libxdp.mk # get list of objects in util include $(LIB_DIR)/util/util.mk # Extend if including Makefile already added some LIB_OBJS += $(foreach obj,$(UTIL_OBJS),$(LIB_DIR)/util/$(obj)) EXTRA_DEPS += EXTRA_USER_DEPS += LDFLAGS+=-L$(LIBXDP_DIR) ifeq ($(DYNAMIC_LIBXDP),1) LDLIBS:=-lxdp $(LDLIBS) OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.so.$(LIBXDP_VERSION) else LDLIBS:=-l:libxdp.a $(LDLIBS) OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.a endif # Detect submodule libbpf source file changes ifeq ($(SYSTEM_LIBBPF),n) LIBBPF_SOURCES := $(wildcard $(LIBBPF_DIR)/src/*.[ch]) endif LIBXDP_SOURCES := $(wildcard $(LIBXDP_DIR)/*.[ch] $(LIBXDP_DIR)/*.in) # BPF-prog kern and userspace shares struct via header file: KERN_USER_H ?= $(wildcard common_kern_user.h) CFLAGS += -I$(HEADER_DIR) -I$(LIB_DIR)/util $(ARCH_INCLUDES) BPF_CFLAGS += -I$(HEADER_DIR) $(ARCH_INCLUDES) BPF_HEADERS := $(wildcard $(HEADER_DIR)/bpf/*.h) $(wildcard $(HEADER_DIR)/xdp/*.h) all: $(USER_TARGETS) $(XDP_OBJ) $(EXTRA_TARGETS) $(TEST_TARGETS) man .PHONY: clean 
clean:: $(Q)rm -f $(USER_TARGETS) $(XDP_OBJ) $(TEST_TARGETS) $(USER_OBJ) $(TEST_OBJ) $(USER_GEN) $(BPF_SKEL_H) *.ll .PHONY: install install: all install_local install -m 0755 -d $(DESTDIR)$(SBINDIR) install -m 0755 -d $(DESTDIR)$(BPF_OBJECT_DIR) $(if $(USER_TARGETS),install -m 0755 $(USER_TARGETS) $(DESTDIR)$(SBINDIR)) $(if $(XDP_OBJ_INSTALL),install -m 0644 $(XDP_OBJ_INSTALL) $(DESTDIR)$(BPF_OBJECT_DIR)) $(if $(MAN_FILES),install -m 0755 -d $(DESTDIR)$(MANDIR)/man8) $(if $(MAN_FILES),install -m 0644 $(MAN_FILES) $(DESTDIR)$(MANDIR)/man8) $(if $(SCRIPTS_FILES),install -m 0755 -d $(DESTDIR)$(SCRIPTSDIR)) $(if $(SCRIPTS_FILES),install -m 0755 $(SCRIPTS_FILES) $(DESTDIR)$(SCRIPTSDIR)) $(if $(TEST_FILE),install -m 0755 -d $(DESTDIR)$(SCRIPTSDIR)/tests/$(TOOL_NAME)) $(if $(TEST_FILE),install -m 0644 $(TEST_FILE) $(DESTDIR)$(SCRIPTSDIR)/tests/$(TOOL_NAME)) $(if $(TEST_FILE_DEPS),install -m 0644 $(TEST_FILE_DEPS) $(DESTDIR)$(SCRIPTSDIR)/tests/$(TOOL_NAME)) $(if $(TEST_TARGETS),install -m 0755 $(TEST_TARGETS) $(DESTDIR)$(SCRIPTSDIR)) .PHONY: install_local install_local:: $(OBJECT_LIBBPF): $(LIBBPF_SOURCES) $(Q)$(MAKE) -C $(LIB_DIR) libbpf $(OBJECT_LIBXDP): $(LIBXDP_SOURCES) $(Q)$(MAKE) -C $(LIBXDP_DIR) $(CONFIGMK): $(Q)$(MAKE) -C $(LIB_DIR)/.. 
config.mk # Create expansions for dependencies LIB_H := ${LIB_OBJS:.o=.h} # Detect if any of common obj changed and create dependency on .h-files $(LIB_OBJS): %.o: %.c %.h $(LIB_H) $(Q)$(MAKE) -C $(dir $@) $(notdir $@) ALL_EXEC_TARGETS=$(USER_TARGETS) $(TEST_TARGETS) $(ALL_EXEC_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(KERN_USER_H) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) $(BPF_SKEL_H) $(USER_EXTRA_C) $(QUIET_CC)$(CC) -Wall $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \ $< $(USER_EXTRA_C) $(LDLIBS) $(XDP_OBJ): %.o: %.c $(KERN_USER_H) $(EXTRA_DEPS) $(BPF_HEADERS) $(LIBMK) $(QUIET_CLANG)$(CLANG) -target $(BPF_TARGET) $(BPF_CFLAGS) -O2 -c -g -o $@ $< $(BPF_SKEL_H): %.skel.h: %.bpf.o $(QUIET_GEN)$(BPFTOOL) gen skeleton $< name $(notdir ${@:.skel.h=}) > $@ .PHONY: man ifeq ($(EMACS),) man: ; else man: $(MAN_PAGE) $(MAN_PAGE): README.org $(LIBMK) $(LIB_DIR)/export-man.el $(QUIET_GEN)$(EMACS) -Q --batch --load "$(LIB_DIR)/export-man.el" \ --eval "(export-man-page \"$@\" \"$<\" \"$(HAVE_FEATURES)\" \"v$(TOOLS_VERSION)\")" endif .PHONY: test ifeq ($(TEST_FILE),) test: @echo " No tests defined" else test: all $(Q)$(TEST_DIR)/test_runner.sh $(TEST_FILE) $(TESTS) endif xdp-tools-1.6.1/lib/defines.mk000066400000000000000000000026571514310632100161720ustar00rootroot00000000000000CFLAGS ?= -O2 -g BPF_CFLAGS ?= -Wall -Wno-unused-value -Wno-pointer-sign \ -Wno-compare-distinct-pointer-types \ -Wno-visibility -Werror -fno-stack-protector BPF_TARGET ?= bpf HAVE_FEATURES := include $(LIB_DIR)/../config.mk include $(LIB_DIR)/../version.mk PREFIX?=/usr/local LIBDIR?=$(PREFIX)/lib SBINDIR?=$(PREFIX)/sbin HDRDIR?=$(PREFIX)/include/xdp DATADIR?=$(PREFIX)/share RUNDIR?=/run MANDIR?=$(DATADIR)/man SCRIPTSDIR?=$(DATADIR)/xdp-tools BPF_DIR_MNT ?=/sys/fs/bpf BPF_OBJECT_DIR ?=$(LIBDIR)/bpf MAX_DISPATCHER_ACTIONS ?=10 HEADER_DIR = $(LIB_DIR)/../headers TEST_DIR = $(LIB_DIR)/testing LIBXDP_DIR := $(LIB_DIR)/libxdp LIBBPF_DIR := $(LIB_DIR)/libbpf DEFINES := 
-DBPF_DIR_MNT=\"$(BPF_DIR_MNT)\" -DBPF_OBJECT_PATH=\"$(BPF_OBJECT_DIR)\" \ -DMAX_DISPATCHER_ACTIONS=$(MAX_DISPATCHER_ACTIONS) -DTOOLS_VERSION=\"$(TOOLS_VERSION)\" \ -DLIBBPF_VERSION=\"$(LIBBPF_VERSION)\" -DRUNDIR=\"$(RUNDIR)\" DEFINES += $(foreach feat,$(HAVE_FEATURES),-DHAVE_$(feat)) ifneq ($(PRODUCTION),1) DEFINES += -DDEBUG endif ifeq ($(SYSTEM_LIBBPF),y) DEFINES += -DLIBBPF_DYNAMIC endif DEFINES += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 CFLAGS += -std=gnu11 -Wextra -Werror $(DEFINES) $(ARCH_INCLUDES) BPF_CFLAGS += $(DEFINES) $(filter -ffile-prefix-map=%,$(CFLAGS)) $(filter -I%,$(CFLAGS)) $(ARCH_INCLUDES) CONFIGMK := $(LIB_DIR)/../config.mk LIBMK := Makefile $(CONFIGMK) $(LIB_DIR)/defines.mk $(LIB_DIR)/common.mk $(LIB_DIR)/../version.mk xdp-tools-1.6.1/lib/export-man.el000066400000000000000000000057011514310632100166310ustar00rootroot00000000000000;;; export-man.el -- Export man page and filter result ;;; Commentary: ;;; ;;; Exports a man page and filters the result so we can exclude parts of the man ;;; page based on features enabled in the build system. ;;; ;;; The export-man-page function is called from common.mk with --eval ;;; Code: (require 'ox-man) (require 'parse-time) (defvar feature-exclude-tags '(("LIBBPF_PERF_BUFFER__CONSUME" . "feat_perfbuf")) "Mapping of feature strings to exclude tags for man page export.") (defvar feature-exclude-regexes '(("LIBBPF_PERF_BUFFER__CONSUME" . "--perf-wakeup")) "Mapping of feature strings to regexes to filter form export man page.") (defun get-feature-values (enabled-feats exclude-list) "Get feature-tag values for ENABLED-FEATS based on EXCLUDE-LIST." (delq nil (mapcar #'(lambda (f) (unless (member (car f) enabled-feats) (cdr f))) exclude-list))) (defun replace-regexp-in-buffer (regexp replace) "Replace REGEXP with REPLACE in buffer." 
(let ((case-fold-search nil)) (goto-char 0) (when (re-search-forward regexp nil t) (replace-match replace)))) (defun open-file (filename) "Find file FILENAME but complain if it doesn't exist." (if (file-exists-p filename) (find-file filename) (error "File not found: %s" filename))) (defun get-file-mod-time (filename) (let* ((file-modtime (file-attribute-modification-time (file-attributes filename))) (git-logtime (ignore-errors (shell-command-to-string (format "git log -1 --pretty='format:%%cI' -- %s" filename)))) (git-modtime (ignore-errors (parse-iso8601-time-string git-logtime)))) (or git-modtime file-modtime))) (defun filter-post-export (file feat-list version modtime) "Post-process exported FILE based on features in FEAT-LIST and VERSION." (let ((exclude-regexes (get-feature-values feat-list feature-exclude-regexes)) (date (format-time-string "%B %_d, %Y" modtime)) (make-backup-files nil)) (with-current-buffer (open-file file) (mapc #'(lambda (r) (delete-matching-lines r)) exclude-regexes) (replace-regexp-in-buffer "DATE" date) (replace-regexp-in-buffer "VERSION" version) (replace-regexp-in-buffer "^.SH \"\\([^\"]+\\) - \\([^\"]+\\)\"" ".SH \"NAME\"\n\\1 \\\\- \\2\n.SH \"SYNOPSIS\"") (delete-trailing-whitespace) (save-buffer)))) (defun export-man-page (outfile infile enabled-features version) "Export man page from INFILE into OUTFILE with ENABLED-FEATURES and VERSION." 
(let* ((feat-list (split-string enabled-features)) (org-export-exclude-tags (get-feature-values feat-list feature-exclude-tags)) (modtime (get-file-mod-time infile))) (with-current-buffer (open-file infile) (org-export-to-file 'man outfile) (filter-post-export outfile feat-list version modtime)))) (provide 'export-man) ;;; export-man.el ends here xdp-tools-1.6.1/lib/libbpf/000077500000000000000000000000001514310632100154505ustar00rootroot00000000000000xdp-tools-1.6.1/lib/libxdp/000077500000000000000000000000001514310632100154745ustar00rootroot00000000000000xdp-tools-1.6.1/lib/libxdp/.gitignore000066400000000000000000000000501514310632100174570ustar00rootroot00000000000000*.so.* *.a *.pc sharedobjs/ staticobjs/ xdp-tools-1.6.1/lib/libxdp/Makefile000066400000000000000000000136231514310632100171410ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) LIB_DIR = .. include libxdp.mk include $(LIB_DIR)/defines.mk OBJDIR ?= . SHARED_OBJDIR := $(OBJDIR)/sharedobjs STATIC_OBJDIR := $(OBJDIR)/staticobjs OBJS := libxdp.o xsk.o XDP_OBJS := xdp-dispatcher.o xsk_def_xdp_prog.o xsk_def_xdp_prog_5.3.o EMBEDDED_XDP_OBJS := $(addsuffix .embed.o,$(basename $(XDP_OBJS))) SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS)) STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) $(EMBEDDED_XDP_OBJS) STATIC_LIBS := $(OBJDIR)/libxdp.a MAN_PAGE := libxdp.3 MAN_OBJ := ${MAN_PAGE:.3=.man} MAN_FILES := $(MAN_PAGE) TEST_DIR := tests SHARED_CFLAGS += -fPIC -DSHARED STATIC_CFLAGS += -D LIBXDP_STATIC=1 LIB_HEADERS := $(wildcard $(HEADER_DIR)/xdp/*.h) BPF_HEADERS := $(wildcard $(HEADER_DIR)/bpf/*.h) $(wildcard $(HEADER_DIR)/xdp/*.h) EXTRA_LIB_DEPS := $(OBJECT_LIBBPF) $(LIBMK) $(LIB_OBJS) $(LIB_HEADERS) compat.h libxdp_internal.h xsk_def_xdp_prog.h bpf_instr.h PC_FILE := $(OBJDIR)/libxdp.pc TEMPLATED_SOURCES := xdp-dispatcher.c CFLAGS += -I$(HEADER_DIR) BPF_CFLAGS += -I$(HEADER_DIR) $(ARCH_INCLUDES) ifndef BUILD_STATIC_ONLY SHARED_LIBS := $(OBJDIR)/libxdp.so \ 
$(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION) \ $(OBJDIR)/libxdp.so.$(LIBXDP_VERSION) VERSION_SCRIPT := libxdp.map CHECK_RULES := check_abi endif all: $(STATIC_LIBS) $(SHARED_LIBS) $(XDP_OBJS) $(PC_FILE) check man build_tests .PHONY: clean clean: $(Q)rm -f $(STATIC_LIBS) $(STATIC_OBJS) $(SHARED_LIBS) $(SHARED_OBJS) $(XDP_OBJS) $(PC_FILE) $(MAN_OBJ) $(TEMPLATED_SOURCES) *.ll $(Q)for d in $(SHARED_OBJDIR) $(STATIC_OBJDIR); do \ [ -d "$$d" ] && rmdir "$$d"; done || true $(Q)$(MAKE) -C $(TEST_DIR) clean .PHONY: install install: all $(Q)install -d -m 0755 $(DESTDIR)$(HDRDIR) $(Q)install -d -m 0755 $(DESTDIR)$(LIBDIR) $(Q)install -d -m 0755 $(DESTDIR)$(LIBDIR)/pkgconfig $(Q)install -d -m 0755 $(DESTDIR)$(BPF_OBJECT_DIR) $(Q)install -m 0644 $(LIB_HEADERS) $(DESTDIR)$(HDRDIR)/ $(Q)install -m 0644 $(PC_FILE) $(DESTDIR)$(LIBDIR)/pkgconfig/ $(Q)cp -fpR $(SHARED_LIBS) $(STATIC_LIBS) $(DESTDIR)$(LIBDIR) $(Q)install -m 0644 $(XDP_OBJS) $(DESTDIR)$(BPF_OBJECT_DIR) $(if $(MAN_FILES),$(Q)install -m 0755 -d $(DESTDIR)$(MANDIR)/man3) $(if $(MAN_FILES),$(Q)install -m 0644 $(MAN_FILES) $(DESTDIR)$(MANDIR)/man3) $(Q)$(MAKE) -C $(TEST_DIR) install $(OBJDIR)/libxdp.a: $(STATIC_OBJS) $(QUIET_LINK)$(AR) rcs $@ $^ $(OBJDIR)/libxdp.so: $(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION) $(Q)ln -sf $(^F) $@ $(OBJDIR)/libxdp.so.$(LIBXDP_MAJOR_VERSION): $(OBJDIR)/libxdp.so.$(LIBXDP_VERSION) $(Q)ln -sf $(^F) $@ $(OBJDIR)/libxdp.so.$(LIBXDP_VERSION): $(SHARED_OBJS) $(QUIET_LINK)$(CC) -shared -Wl,-soname,libxdp.so.$(LIBXDP_MAJOR_VERSION) \ -Wl,--version-script=$(VERSION_SCRIPT) \ $^ $(LDFLAGS) $(LDLIBS) -o $@ $(OBJDIR)/libxdp.pc: $(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \ -e "s|@LIBDIR@|$(LIBDIR)|" \ -e "s|@VERSION@|$(TOOLS_VERSION)|" \ < libxdp.pc.template > $@ $(STATIC_OBJDIR): $(Q)mkdir -p $(STATIC_OBJDIR) $(SHARED_OBJDIR): $(Q)mkdir -p $(SHARED_OBJDIR) $(STATIC_OBJDIR)/%.o: %.c $(EXTRA_LIB_DEPS) | $(STATIC_OBJDIR) $(QUIET_CC)$(CC) $(CFLAGS) $(CPPFLAGS) $(STATIC_CFLAGS) -Wall -I../../headers -c $< -o $@ 
$(SHARED_OBJDIR)/%.o: %.c $(EXTRA_LIB_DEPS) | $(SHARED_OBJDIR) $(QUIET_CC)$(CC) $(CFLAGS) $(CPPFLAGS) $(SHARED_CFLAGS) -Wall -I../../headers -c $< -o $@ XDP_IN_SHARED := $(SHARED_OBJDIR)/libxdp.o $(SHARED_OBJDIR)/xsk.o GLOBAL_SYM_COUNT = $(shell $(READELF) -s --wide $(XDP_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}' | \ sort -u | wc -l) VERSIONED_SYM_COUNT = $(shell $(READELF) --dyn-syms --wide $(OBJDIR)/libxdp.so | \ grep -Eo '[^ ]+@LIBXDP_' | cut -d@ -f1 | sort -u | wc -l) check: $(CHECK_RULES) check_abi: $(OBJDIR)/libxdp.so @if [ "$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then \ echo "Warning: Num of global symbols in $(XDP_IN_SHARED)" \ "($(GLOBAL_SYM_COUNT)) does NOT match with num of" \ "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \ "Please make sure all symbols are" \ "versioned in $(VERSION_SCRIPT)." >&2; \ $(READELF) -s --wide $(XDP_IN_SHARED) | \ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \ sed 's/\[.*\]//' | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$NF}'| \ sort -u > $(OUTPUT)libxdp_global_syms.tmp; \ $(READELF) --dyn-syms --wide $(OUTPUT)libxdp.so | \ grep -Eo '[^ ]+@LIBXDP_' | cut -d@ -f1 | \ sort -u > $(OUTPUT)libxdp_versioned_syms.tmp; \ diff -u $(OUTPUT)libxdp_global_syms.tmp \ $(OUTPUT)libxdp_versioned_syms.tmp; \ rm $(OUTPUT)libxdp_global_syms.tmp \ $(OUTPUT)libxdp_versioned_syms.tmp; \ exit 1; \ fi $(TEMPLATED_SOURCES): %.c: %.c.in Makefile $(QUIET_M4)$(M4) $(DEFINES) $< > $@ || ( ret=$$?; rm -f $@; exit $$ret ) $(EMBEDDED_XDP_OBJS): %.embed.o: %.o $(QUIET_CC)$(CC) -r -nostdlib -Wl,-z,noexecstack,--format=binary $(LDFLAGS) -o $@ $< $(Q)$(OBJCOPY) --rename-section .data=.rodata,alloc,load,readonly,data,contents $@ $(XDP_OBJS): %.o: %.c $(BPF_HEADERS) $(LIBMK) $(QUIET_CLANG)$(CLANG) -target $(BPF_TARGET) $(BPF_CFLAGS) -O2 -c -g -o $@ $< .PHONY: man ifeq ($(EMACS),) man: ; else man: $(MAN_PAGE) $(MAN_OBJ): README.org $(LIBMK) 
$(Q)$(EMACS) -Q --batch --find-file $< --eval "(progn (require 'ox-man)(org-man-export-to-man))" $(MAN_PAGE): $(MAN_OBJ) $(LIBMK) $(QUIET_GEN)MODDATE=$$(git log -1 --pretty="format:%cI" README.org 2>/dev/null); \ [ "$$?" -eq "0" ] && DATE=$$(date '+%B %_d, %Y' -d "$$MODDATE") || DATE=$$(date '+%B %_d, %Y'); \ sed -e "1 s/DATE/$$DATE/" -e "1 s/VERSION/v$(TOOLS_VERSION)/" -e '1,5 s/^.SH "\([^"]\+\) - \([^"]\+\)"/.SH "NAME"\n\1 \\- \2\n.SH "SYNOPSIS"/' $< > $@ endif .PHONY: build_tests build_tests: $(SHARED_LIBS) $(STATIC_LIBS) $(Q)$(MAKE) -C $(TEST_DIR) .PHONY: test test: all $(Q)$(MAKE) -C $(TEST_DIR) run xdp-tools-1.6.1/lib/libxdp/README.org000066400000000000000000000553771514310632100171630ustar00rootroot00000000000000#+EXPORT_FILE_NAME: libxdp #+TITLE: libxdp #+OPTIONS: ^:nil #+MAN_CLASS_OPTIONS: :section-id "3\" \"DATE\" \"VERSION\" \"libxdp - library for loading XDP programs" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * libxdp - library for attaching XDP programs and using AF_XDP sockets This directory contains the files for the =libxdp= library for attaching XDP programs to network interfaces and using AF_XDP sockets. The library is fairly lightweight and relies on =libbpf= to do the heavy lifting for processing eBPF object files etc. =Libxdp= provides two primary features on top of =libbpf=. The first is the ability to load multiple XDP programs in sequence on a single network device (which is not natively supported by the kernel). This support relies on the =freplace= functionality in the kernel, which makes it possible to attach an eBPF program as a replacement for a global function in another (already loaded) eBPF program. 
The second main feature is helper functions for configuring AF_XDP sockets as well as reading and writing packets from these sockets. Some of the functionality provided by libxdp depends on particular kernel features; see the "Kernel feature compatibility" section below for details. ** Using libxdp from an application Basic usage of libxdp from an application is quite straight forward. The following example loads, then unloads, an XDP program from the 'lo' interface: #+begin_src C #define IFINDEX 1 struct xdp_program *prog; int err; prog = xdp_program__open_file("my-program.o", "section_name", NULL); err = xdp_program__attach(prog, IFINDEX, XDP_MODE_NATIVE, 0); if (!err) xdp_program__detach(prog, IFINDEX, XDP_MODE_NATIVE, 0); xdp_program__close(prog); #+end_src The =xdp_program= structure is an opaque structure that represents a single XDP program. =libxdp= contains functions to create such a struct either from a BPF object file on disk, from a =libbpf= BPF object, or from an identifier of a program that is already loaded into the kernel: #+begin_src C struct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, const char *section_name); struct xdp_program *xdp_program__find_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__open_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__from_fd(int fd); struct xdp_program *xdp_program__from_id(__u32 prog_id); struct xdp_program *xdp_program__from_pin(const char *pin_path); #+end_src The functions that open a BPF object or file need the function name of the XDP program as well as the file name or object, since an ELF file can contain multiple XDP programs. 
The =xdp_program__find_file()= function takes a filename without a path, and will look for the object in =LIBXDP_OBJECT_PATH= which defaults to =/usr/lib/bpf= (alternatively =/usr/lib64/bpf= or =/usr/local/lib/bpf=, depending on which =PREFIX= libxdp was compiled with). This is convenient for applications shipping pre-compiled eBPF object files. When libxdp is compiled with the DEBUG flag, it will additionally look for BPF object files in the current directory, before checking the system-wide directory. This can be convenient during development to avoid older versions of a BPF program being loaded. However, it should not be used in production, as the shadowing of system BPF programs can be a security issue. The =xdp_program__attach()= function will attach the program to an interface, building a dispatcher program to execute it. Multiple programs can be attached at once with =xdp_program__attach_multi()=; they will be sorted in order of their run priority, and execution from one program to the next will proceed based on the chain call actions defined for each program (see the *Program metadata* section below). Because the loading process involves modifying the attach type of the program, the attach functions only work with =struct xdp_program= objects that have not yet been loaded into the kernel. When using the attach functions to attach to an interface that already has an XDP program loaded, libxdp will attempt to add the program to the list of loaded programs. However, this may fail, either due to missing kernel support, or because the already-attached program was not loaded using a dispatcher compatible with libxdp. If the kernel support for incremental attach (merged in kernel 5.10) is missing, the only way to actually run multiple programs on a single interface is to attach them all at the same time with =xdp_program__attach_multi()=. 
If the existing program is not an XDP dispatcher, that program will have to be detached from the interface before libxdp can attach a new one. This can be done by calling =xdp_program__detach()= with a reference to the loaded program; but note that this will of course break any application relying on that other XDP program to be present. * Program metadata To support multiple XDP programs on the same interface, libxdp uses two pieces of metadata for each XDP program: Run priority and chain call actions. *** Run priority This is the priority of the program and is a simple integer used to sort programs when loading multiple programs onto the same interface. Programs that wish to run early (such as a packet filter) should set low values for this, while programs that want to run later (such as a packet forwarder or counter) should set higher values. Note that later programs are only run if the previous programs end with a return code that is part of its chain call actions (see below). If not specified, the default priority value is 50. *** Chain call actions These are the program return codes that the program indicate for packets that should continue processing. If the program returns one of these actions, later programs in the call chain will be run, whereas if it returns any other action, processing will be interrupted, and the XDP dispatcher will return the verdict immediately. If not set, this defaults to just XDP_PASS, which is likely the value most programs should use. *** Specifying metadata The metadata outlined above is specified as BTF information embedded in the ELF file containing the XDP program. 
The =xdp_helpers.h= file shipped with libxdp contains helper macros to include this information, which can be used as follows: #+begin_src C #include #include struct { __uint(priority, 10); __uint(XDP_PASS, 1); __uint(XDP_DROP, 1); } XDP_RUN_CONFIG(my_xdp_func); #+end_src This example specifies that the XDP program in =my_xdp_func= should have priority 10 and that its chain call actions are =XDP_PASS= and =XDP_DROP=. In a source file with multiple XDP programs in the same file, a definition like the above can be included for each program (main XDP function). Any program that does not specify any config information will use the default values outlined above. *** Inspecting and modifying metadata =libxdp= exposes the following functions that an application can use to inspect and modify the metadata on an XDP program. Modification is only possible before a program is attached on an interface. These functions won't modify the BTF information itself, but the new values will be stored as part of the program attachment. #+begin_src C unsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog); int xdp_program__set_run_prio(struct xdp_program *xdp_prog, unsigned int run_prio); bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog, enum xdp_action action); int xdp_program__set_chain_call_enabled(struct xdp_program *prog, unsigned int action, bool enabled); int xdp_program__print_chain_call_actions(const struct xdp_program *prog, char *buf, size_t buf_len); #+end_src * The dispatcher program To support multiple non-offloaded programs on the same network interface, =libxdp= uses a *dispatcher program* which is a small wrapper program that will call each component program in turn, expect the return code, and then chain call to the next program based on the chain call actions of the previous program (see the *Program metadata* section above). 
While applications using =libxdp= do not need to know the details of the dispatcher program to just load an XDP program unto an interface, =libxdp= does expose the dispatcher and its attached component programs, which can be used to list the programs currently attached to an interface. The structure used for this is =struct xdp_multiprog=, which can only be constructed from the programs loaded on an interface based on ifindex. The API for getting a multiprog reference and iterating through the attached programs looks like this: #+begin_src C struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex); struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, const struct xdp_multiprog *mp); void xdp_multiprog__close(struct xdp_multiprog *mp); int xdp_multiprog__detach(struct xdp_multiprog *mp, int ifindex); enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp); bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp); #+end_src If a non-offloaded program is attached to the interface which =libxdp= doesn't recognise as a dispatcher program, an =xdp_multiprog= structure will still be returned, and =xdp_multiprog__is_legacy()= will return true for that program (note that this also holds true if only an offloaded program is loaded). A reference to that (regular) XDP program can be obtained by =xdp_multiprog__main_prog()=. If the program attached to the interface *is* a dispatcher program, =xdp_multiprog__main_prog()= will return a reference to the dispatcher program itself, which is mainly useful for obtaining other data about that program (such as the program ID). A reference to an offloaded program can be acquired using =xdp_multiprog_hw_prog()=. 
Function =xdp_multiprog__attach_mode()= returns the attach mode of the non-offloaded program, whether an offloaded program is attached should be checked through =xdp_multiprog_hw_prog()=. ** Pinning in bpffs The kernel will automatically detach component programs from the dispatcher once the last reference to them disappears. To prevent this from happening, =libxdp= will pin the component program references in =bpffs= before attaching the dispatcher to the network interface. The pathnames generated for pinning is as follows: - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference - etc, up to ten component programs If set, the =LIBXDP_BPFFS= environment variable will override the location of =bpffs=, but the =xdp= subdirectory is always used. If no =bpffs= is mounted, libxdp will consult the environment variable =LIBXDP_BPFFS_AUTOMOUNT=. If this is set to =1=, libxdp will attempt to automount a bpffs. If not, libxdp will fall back to loading a single program without a dispatcher, as if the kernel did not support the features needed for multiprog attachment. * Using AF_XDP sockets Libxdp implements helper functions for configuring AF_XDP sockets as well as reading and writing packets from these sockets. AF_XDP sockets can be used to redirect packets to user-space at high rates from an XDP program. Note that this functionality used to reside in libbpf, but has now been moved over to libxdp as it is a better fit for this library. As of the 1.0 release of libbpf, the AF_XDP socket support will be removed and all future development will be performed in libxdp instead. 
For an overview of AF_XDP sockets, please refer to this Linux Plumbers paper (http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf) and the documentation in the Linux kernel (Documentation/networking/af_xdp.rst or https://www.kernel.org/doc/html/latest/networking/af_xdp.html). For an example on how to use the interface, take a look at the AF_XDP-example and AF_XDP-forwarding programs in the bpf-examples repository: https://github.com/xdp-project/bpf-examples. ** Control path Libxdp provides helper functions for creating and destroying umems and sockets as shown below. The first thing that a user generally wants to do is to create a umem area. This is the area that will contain all packets received and the ones that are going to be sent. After that, AF_XDP sockets can be created tied to this umem. These can either be sockets that have exclusive ownership of that umem through xsk_socket__create() or shared with other sockets using xsk_socket__create_shared. There is one option called XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD that can be set in the libxdp_flags field (also called libbpf_flags for compatibility reasons). This will make libxdp not load any XDP program or set and BPF maps which is a must if users want to add their own XDP program. If there is already a socket created with socket(AF_XDP, SOCK_RAW, 0) not bound and not tied to any umem, file descriptor of this socket can be used in param opts of xsk_umem__create_opts(), which is a recommended way of umem creation. 
#+begin_src C struct xsk_umem *xsk_umem__create_opts(void *umem_area, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, struct xsk_umem_opts *opts); int xsk_umem__create(struct xsk_umem **umem, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); int xsk_umem__create_with_fd(struct xsk_umem **umem, int fd, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); int xsk_socket__create(struct xsk_socket **xsk, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *config); int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_socket_config *config); int xsk_umem__delete(struct xsk_umem *umem); void xsk_socket__delete(struct xsk_socket *xsk); #+end_src There are also two helper function to get the file descriptor of a umem or a socket. These are needed when using standard Linux syscalls such as poll(), recvmsg(), sendto(), etc. #+begin_src C int xsk_umem__fd(const struct xsk_umem *umem); int xsk_socket__fd(const struct xsk_socket *xsk); #+end_src The control path also provides two APIs for setting up AF_XDP sockets when the process that is going to use the AF_XDP socket is non-privileged. These two functions perform the operations that require privileges and can be executed from some form of control process that has the necessary privileges. The xsk_socket__create executed on the non-privileged process will then skip these two steps. For an example on how to use these, please take a look at the AF_XDP-example program in the bpf-examples repository: https://github.com/xdp-project/bpf-examples/tree/main/AF_XDP-example. 
#+begin_src C int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); #+end_src To further reduce required level of privileges, an AF_XDP socket can be created beforehand with socket(AF_XDP, SOCK_RAW, 0) and passed to a non-privileged process. This socket can be used in xsk_umem__create_opts() and later in xsk_socket__create() with created umem. xsk_socket__create_shared() would still require privileges for AF_XDP socket creation. ** Data path For performance reasons, all the data path functions are static inline functions found in the xsk.h header file so they can be optimized into the target application binary for best possible performance. There are four FIFO rings of two main types: producer rings (fill and Tx) and consumer rings (Rx and completion). The producer rings use xsk_ring_prod functions and consumer rings use xsk_ring_cons functions. For producer rings, you start with =reserving= one or more slots in a producer ring and then when they have been filled out, you =submit= them so that the kernel will act on them. For a consumer ring, you =peek= if there are any new packets in the ring and if so you can read them from the ring. Once you are done reading them, you =release= them back to the kernel so it can use them for new packets. There is also a =cancel= operation for consumer rings if the application does not want to consume all packets received with the peek operation. #+begin_src C __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx); void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx); void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); #+end_src The functions below are used for reading and writing the descriptors of the rings. 
xsk_ring_prod__fill_addr() and xsk_ring_prod__tx_desc() *writes* entries in the fill and Tx rings respectively, while xsk_ring_cons__comp_addr and xsk_ring_cons__rx_desc *reads* entries from the completion and Rx rings respectively. The =idx= is the parameter returned in the xsk_ring_prod__reserve or xsk_ring_cons__peek calls. To advance to the next entry, simply do =idx++=. #+begin_src C __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx); struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx); const __u64 *xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); const struct xdp_desc *xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); #+end_src The xsk_umem functions are used to get a pointer to the packet data itself, always located inside the umem. In the default aligned mode, you can get the addr variable straight from the Rx descriptor. But in unaligned mode, you need to use the three last function below as the offset used is carried in the upper 16 bits of the addr. Therefore, you cannot use the addr straight from the descriptor in the unaligned case. #+begin_src C void *xsk_umem__get_data(void *umem_area, __u64 addr); __u64 xsk_umem__extract_addr(__u64 addr); __u64 xsk_umem__extract_offset(__u64 addr); __u64 xsk_umem__add_offset_to_addr(__u64 addr); #+end_src There is one more function in the data path and that checks if the need_wakeup flag is set. Use of this flag is highly encouraged and should be enabled by setting =XDP_USE_NEED_WAKEUP= bit in the =xdp_bind_flags= field that is provided to the xsk_socket_create_[shared]() calls. If this function returns true, then you need to call =recvmsg()=, =sendto()=, or =poll()= depending on the situation. =recvmsg()= if you are *receiving*, or =sendto()= if you are *sending*. =poll()= can be used for both cases and provide the ability to sleep too, as with any other socket. But note that poll is a slower operation than the other two. 
#+begin_src C int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); #+end_src For an example on how to use all these APIs, take a look at the AF_XDP-example and AF_XDP-forwarding programs in the bpf-examples repository: https://github.com/xdp-project/bpf-examples. * Kernel and BPF program feature compatibility The features exposed by libxdp relies on certain kernel versions and BPF features to work. To get the full benefit of all features, libxdp needs to be used with kernel 5.10 or newer, unless the commits mentioned below have been backported. However, libxdp will probe the kernel and transparently fall back to legacy loading procedures, so it is possible to use the library with older versions, although some features will be unavailable, as detailed below. The ability to attach multiple BPF programs to a single interface relies on the kernel "BPF program extension" feature which was introduced by commit be8704ff07d2 ("bpf: Introduce dynamic program extensions") in the upstream kernel and first appeared in kernel release 5.6. To *incrementally* attach multiple programs, a further refinement added by commit 4a1e7c0c63e0 ("bpf: Support attaching freplace programs to multiple attach points") is needed; this first appeared in the upstream kernel version 5.10. The functionality relies on the "BPF trampolines" feature which is available on architectures that support it. The support matrix is as follows: - Kernels before 5.6 can only attach a single XDP program to each interface - Kernels 5.6+ can attach multiple programs if they are all attached at the same time - Kernels 5.10+ have full support for XDP multiprog on architectures supporting BPF trampolines - On architectures without BPF trampoline support, only a single program can be attached to each interface To load AF_XDP programs, kernel support for AF_XDP sockets needs to be included and enabled in the kernel build. In addition, when using AF_XDP sockets, an XDP program is also loaded on the interface. 
The XDP program used for this by libxdp requires the ability to do map lookups into XSK maps, which was introduced with commit fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap") in kernel 5.3. This means that the minimum required kernel version for using AF_XDP is kernel 5.3; however, for the AF_XDP XDP program to co-exist with other programs, the same constraints for multiprog applies as outlined above. Note that some Linux distributions backport features to earlier kernel versions, especially in enterprise kernels; for instance, Red Hat Enterprise Linux kernels include everything needed for libxdp to function since RHEL 8.5. Finally, XDP programs loaded using the multiprog facility must include type information (using the BPF Type Format, BTF). To get this, compile the programs with a recent version of Clang/LLVM (version 10+), and enable debug information when compiling (using the =-g= option). * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHORS libxdp and this man page were written by Toke Høiland-Jørgensen. AF_XDP support and documentation was contributed by Magnus Karlsson. 
xdp-tools-1.6.1/lib/libxdp/bpf_instr.h000066400000000000000000000060651514310632100176420ustar00rootroot00000000000000/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ #ifndef __BPF_INSTR_H #define __BPF_INSTR_H #include #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((FUNC) - BPF_FUNC_unspec) }) #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_LD_IMM64_RAW_FULL(DST, SRC, OFF1, OFF2, IMM1, IMM2) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 
OFF1, \ .imm = IMM1 }), \ ((struct bpf_insn) { \ .code = 0, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF2, \ .imm = IMM2 }) #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_FD, 0, 0, \ MAP_FD, 0) #define BPF_LD_MAP_VALUE(DST, MAP_FD, VALUE_OFF) \ BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \ MAP_FD, VALUE_OFF) #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #endif xdp-tools-1.6.1/lib/libxdp/compat.h000066400000000000000000000004041514310632100171260ustar00rootroot00000000000000#ifndef __COMPAT_H #define __COMPAT_H #ifndef HAVE_SECURE_GETENV #include // Source: https://www.openwall.com/lists/musl/2019/05/28/3 static inline char *secure_getenv(const char *name) { return libc.secure ? NULL : getenv(name); } #endif #endif xdp-tools-1.6.1/lib/libxdp/libxdp.3000066400000000000000000000573601514310632100170550ustar00rootroot00000000000000.TH "libxdp" "3" "January 15, 2026" "v1.6.1" "libxdp - library for loading XDP programs" .SH "NAME" libxdp \- library for attaching XDP programs and using AF_XDP sockets .SH "SYNOPSIS" .PP This directory contains the files for the \fIlibxdp\fP library for attaching XDP programs to network interfaces and using AF_XDP sockets. The library is fairly lightweight and relies on \fIlibbpf\fP to do the heavy lifting for processing eBPF object files etc. .PP \fILibxdp\fP provides two primary features on top of \fIlibbpf\fP. The first is the ability to load multiple XDP programs in sequence on a single network device (which is not natively supported by the kernel). 
This support relies on the \fIfreplace\fP functionality in the kernel, which makes it possible to attach an eBPF program as a replacement for a global function in another (already loaded) eBPF program. The second main feature is helper functions for configuring AF_XDP sockets as well as reading and writing packets from these sockets. .PP Some of the functionality provided by libxdp depends on particular kernel features; see the "Kernel feature compatibility" section below for details. .SS "Using libxdp from an application" .PP Basic usage of libxdp from an application is quite straight forward. The following example loads, then unloads, an XDP program from the 'lo' interface: .RS .nf \fC#define IFINDEX 1 struct xdp_program *prog; int err; prog = xdp_program__open_file("my-program.o", "section_name", NULL); err = xdp_program__attach(prog, IFINDEX, XDP_MODE_NATIVE, 0); if (!err) xdp_program__detach(prog, IFINDEX, XDP_MODE_NATIVE, 0); xdp_program__close(prog); \fP .fi .RE .PP The \fIxdp_program\fP structure is an opaque structure that represents a single XDP program. 
\fIlibxdp\fP contains functions to create such a struct either from a BPF object file on disk, from a \fIlibbpf\fP BPF object, or from an identifier of a program that is already loaded into the kernel: .RS .nf \fCstruct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, const char *section_name); struct xdp_program *xdp_program__find_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__open_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts); struct xdp_program *xdp_program__from_fd(int fd); struct xdp_program *xdp_program__from_id(__u32 prog_id); struct xdp_program *xdp_program__from_pin(const char *pin_path); \fP .fi .RE .PP The functions that open a BPF object or file need the function name of the XDP program as well as the file name or object, since an ELF file can contain multiple XDP programs. The \fIxdp_program__find_file()\fP function takes a filename without a path, and will look for the object in \fILIBXDP_OBJECT_PATH\fP which defaults to \fI/usr/lib/bpf\fP (alternatively \fI/usr/lib64/bpf\fP or \fI/usr/local/lib/bpf\fP, depending on which \fIPREFIX\fP libxdp was compiled with). This is convenient for applications shipping pre-compiled eBPF object files. .PP When libxdp is compiled with the DEBUG flag, it will additionally look for BPF object files in the current directory, before checking the system-wide directory. This can be convenient during development to avoid older versions of a BPF program being loaded. However, it should not be used in production, as the shadowing of system BPF programs can be a security issue. .PP The \fIxdp_program__attach()\fP function will attach the program to an interface, building a dispatcher program to execute it. 
Multiple programs can be attached at once with \fIxdp_program__attach_multi()\fP; they will be sorted in order of their run priority, and execution from one program to the next will proceed based on the chain call actions defined for each program (see the \fBProgram metadata\fP section below). Because the loading process involves modifying the attach type of the program, the attach functions only work with \fIstruct xdp_program\fP objects that have not yet been loaded into the kernel. .PP When using the attach functions to attach to an interface that already has an XDP program loaded, libxdp will attempt to add the program to the list of loaded programs. However, this may fail, either due to missing kernel support, or because the already-attached program was not loaded using a dispatcher compatible with libxdp. If the kernel support for incremental attach (merged in kernel 5.10) is missing, the only way to actually run multiple programs on a single interface is to attach them all at the same time with \fIxdp_program__attach_multi()\fP. If the existing program is not an XDP dispatcher, that program will have to be detached from the interface before libxdp can attach a new one. This can be done by calling \fIxdp_program__detach()\fP with a reference to the loaded program; but note that this will of course break any application relying on that other XDP program to be present. .SH "Program metadata" .PP To support multiple XDP programs on the same interface, libxdp uses two pieces of metadata for each XDP program: Run priority and chain call actions. .SS "Run priority" .PP This is the priority of the program and is a simple integer used to sort programs when loading multiple programs onto the same interface. Programs that wish to run early (such as a packet filter) should set low values for this, while programs that want to run later (such as a packet forwarder or counter) should set higher values. 
Note that later programs are only run if the previous programs end with a return code that is part of its chain call actions (see below). If not specified, the default priority value is 50. .SS "Chain call actions" .PP These are the program return codes that the program indicate for packets that should continue processing. If the program returns one of these actions, later programs in the call chain will be run, whereas if it returns any other action, processing will be interrupted, and the XDP dispatcher will return the verdict immediately. If not set, this defaults to just XDP_PASS, which is likely the value most programs should use. .SS "Specifying metadata" .PP The metadata outlined above is specified as BTF information embedded in the ELF file containing the XDP program. The \fIxdp_helpers.h\fP file shipped with libxdp contains helper macros to include this information, which can be used as follows: .RS .nf \fC#include #include struct { __uint(priority, 10); __uint(XDP_PASS, 1); __uint(XDP_DROP, 1); } XDP_RUN_CONFIG(my_xdp_func); \fP .fi .RE .PP This example specifies that the XDP program in \fImy_xdp_func\fP should have priority 10 and that its chain call actions are \fIXDP_PASS\fP and \fIXDP_DROP\fP. In a source file with multiple XDP programs in the same file, a definition like the above can be included for each program (main XDP function). Any program that does not specify any config information will use the default values outlined above. .SS "Inspecting and modifying metadata" .PP \fIlibxdp\fP exposes the following functions that an application can use to inspect and modify the metadata on an XDP program. Modification is only possible before a program is attached on an interface. These functions won't modify the BTF information itself, but the new values will be stored as part of the program attachment. 
.RS .nf \fCunsigned int xdp_program__run_prio(const struct xdp_program *xdp_prog); int xdp_program__set_run_prio(struct xdp_program *xdp_prog, unsigned int run_prio); bool xdp_program__chain_call_enabled(const struct xdp_program *xdp_prog, enum xdp_action action); int xdp_program__set_chain_call_enabled(struct xdp_program *prog, unsigned int action, bool enabled); int xdp_program__print_chain_call_actions(const struct xdp_program *prog, char *buf, size_t buf_len); \fP .fi .RE .SH "The dispatcher program" .PP To support multiple non-offloaded programs on the same network interface, \fIlibxdp\fP uses a \fBdispatcher program\fP which is a small wrapper program that will call each component program in turn, expect the return code, and then chain call to the next program based on the chain call actions of the previous program (see the \fBProgram metadata\fP section above). .PP While applications using \fIlibxdp\fP do not need to know the details of the dispatcher program to just load an XDP program unto an interface, \fIlibxdp\fP does expose the dispatcher and its attached component programs, which can be used to list the programs currently attached to an interface. .PP The structure used for this is \fIstruct xdp_multiprog\fP, which can only be constructed from the programs loaded on an interface based on ifindex. 
The API for getting a multiprog reference and iterating through the attached programs looks like this: .RS .nf \fCstruct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex); struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, const struct xdp_multiprog *mp); void xdp_multiprog__close(struct xdp_multiprog *mp); int xdp_multiprog__detach(struct xdp_multiprog *mp, int ifindex); enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp); struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp); bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp); \fP .fi .RE .PP If a non-offloaded program is attached to the interface which \fIlibxdp\fP doesn't recognise as a dispatcher program, an \fIxdp_multiprog\fP structure will still be returned, and \fIxdp_multiprog__is_legacy()\fP will return true for that program (note that this also holds true if only an offloaded program is loaded). A reference to that (regular) XDP program can be obtained by \fIxdp_multiprog__main_prog()\fP. If the program attached to the interface \fBis\fP a dispatcher program, \fIxdp_multiprog__main_prog()\fP will return a reference to the dispatcher program itself, which is mainly useful for obtaining other data about that program (such as the program ID). A reference to an offloaded program can be acquired using \fIxdp_multiprog_hw_prog()\fP. Function \fIxdp_multiprog__attach_mode()\fP returns the attach mode of the non-offloaded program, whether an offloaded program is attached should be checked through \fIxdp_multiprog_hw_prog()\fP. .SS "Pinning in bpffs" .PP The kernel will automatically detach component programs from the dispatcher once the last reference to them disappears. To prevent this from happening, \fIlibxdp\fP will pin the component program references in \fIbpffs\fP before attaching the dispatcher to the network interface. 
The pathnames generated for pinning is as follows: .IP \(em 4 /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID .IP \(em 4 /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference .IP \(em 4 /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference .IP \(em 4 /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference .IP \(em 4 /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference .IP \(em 4 etc, up to ten component programs .PP If set, the \fILIBXDP_BPFFS\fP environment variable will override the location of \fIbpffs\fP, but the \fIxdp\fP subdirectory is always used. If no \fIbpffs\fP is mounted, libxdp will consult the environment variable \fILIBXDP_BPFFS_AUTOMOUNT\fP. If this is set to \fI1\fP, libxdp will attempt to automount a bpffs. If not, libxdp will fall back to loading a single program without a dispatcher, as if the kernel did not support the features needed for multiprog attachment. .SH "Using AF_XDP sockets" .PP Libxdp implements helper functions for configuring AF_XDP sockets as well as reading and writing packets from these sockets. AF_XDP sockets can be used to redirect packets to user-space at high rates from an XDP program. Note that this functionality used to reside in libbpf, but has now been moved over to libxdp as it is a better fit for this library. As of the 1.0 release of libbpf, the AF_XDP socket support will be removed and all future development will be performed in libxdp instead. .PP For an overview of AF_XDP sockets, please refer to this Linux Plumbers paper (\fIhttp://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf\fP) and the documentation in the Linux kernel (Documentation/networking/af_xdp.rst or \fIhttps://www.kernel.org/doc/html/latest/networking/af_xdp.html\fP). 
.PP For an example on how to use the interface, take a look at the AF_XDP-example and AF_XDP-forwarding programs in the bpf-examples repository: \fIhttps://github.com/xdp-project/bpf-examples\fP. .SS "Control path" .PP Libxdp provides helper functions for creating and destroying umems and sockets as shown below. The first thing that a user generally wants to do is to create a umem area. This is the area that will contain all packets received and the ones that are going to be sent. After that, AF_XDP sockets can be created tied to this umem. These can either be sockets that have exclusive ownership of that umem through xsk_socket__create() or shared with other sockets using xsk_socket__create_shared. There is one option called XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD that can be set in the libxdp_flags field (also called libbpf_flags for compatibility reasons). This will make libxdp not load any XDP program or set and BPF maps which is a must if users want to add their own XDP program. .PP If there is already a socket created with socket(AF_XDP, SOCK_RAW, 0) not bound and not tied to any umem, file descriptor of this socket can be used in param opts of xsk_umem__create_opts(), which is a recommended way of umem creation. 
.RS .nf \fCstruct xsk_umem *xsk_umem__create_opts(void *umem_area, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, struct xsk_umem_opts *opts); int xsk_umem__create(struct xsk_umem **umem, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); int xsk_umem__create_with_fd(struct xsk_umem **umem, int fd, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *config); int xsk_socket__create(struct xsk_socket **xsk, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *config); int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_socket_config *config); int xsk_umem__delete(struct xsk_umem *umem); void xsk_socket__delete(struct xsk_socket *xsk); \fP .fi .RE .PP There are also two helper function to get the file descriptor of a umem or a socket. These are needed when using standard Linux syscalls such as poll(), recvmsg(), sendto(), etc. .RS .nf \fCint xsk_umem__fd(const struct xsk_umem *umem); int xsk_socket__fd(const struct xsk_socket *xsk); \fP .fi .RE .PP The control path also provides two APIs for setting up AF_XDP sockets when the process that is going to use the AF_XDP socket is non-privileged. These two functions perform the operations that require privileges and can be executed from some form of control process that has the necessary privileges. The xsk_socket__create executed on the non-privileged process will then skip these two steps. For an example on how to use these, please take a look at the AF_XDP-example program in the bpf-examples repository: \fIhttps://github.com/xdp-project/bpf-examples/tree/main/AF_XDP-example\fP. 
.RS .nf \fCint xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd); int xsk_socket__update_xskmap(struct xsk_socket *xsk, int xsks_map_fd); \fP .fi .RE .PP To further reduce required level of privileges, an AF_XDP socket can be created beforehand with socket(AF_XDP, SOCK_RAW, 0) and passed to a non-privileged process. This socket can be used in xsk_umem__create_opts() and later in xsk_socket__create() with created umem. xsk_socket__create_shared() would still require privileges for AF_XDP socket creation. .SS "Data path" .PP For performance reasons, all the data path functions are static inline functions found in the xsk.h header file so they can be optimized into the target application binary for best possible performance. There are four FIFO rings of two main types: producer rings (fill and Tx) and consumer rings (Rx and completion). The producer rings use xsk_ring_prod functions and consumer rings use xsk_ring_cons functions. For producer rings, you start with \fIreserving\fP one or more slots in a producer ring and then when they have been filled out, you \fIsubmit\fP them so that the kernel will act on them. For a consumer ring, you \fIpeek\fP if there are any new packets in the ring and if so you can read them from the ring. Once you are done reading them, you \fIrelease\fP them back to the kernel so it can use them for new packets. There is also a \fIcancel\fP operation for consumer rings if the application does not want to consume all packets received with the peek operation. .RS .nf \fC__u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx); void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx); void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); \fP .fi .RE .PP The functions below are used for reading and writing the descriptors of the rings. 
xsk_ring_prod__fill_addr() and xsk_ring_prod__tx_desc() \fBwrites\fP entries in the fill and Tx rings respectively, while xsk_ring_cons__comp_addr and xsk_ring_cons__rx_desc \fBreads\fP entries from the completion and Rx rings respectively. The \fIidx\fP is the parameter returned in the xsk_ring_prod__reserve or xsk_ring_cons__peek calls. To advance to the next entry, simply do \fIidx++\fP. .RS .nf \fC__u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx); struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx); const __u64 *xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); const struct xdp_desc *xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); \fP .fi .RE .PP The xsk_umem functions are used to get a pointer to the packet data itself, always located inside the umem. In the default aligned mode, you can get the addr variable straight from the Rx descriptor. But in unaligned mode, you need to use the three last function below as the offset used is carried in the upper 16 bits of the addr. Therefore, you cannot use the addr straight from the descriptor in the unaligned case. .RS .nf \fCvoid *xsk_umem__get_data(void *umem_area, __u64 addr); __u64 xsk_umem__extract_addr(__u64 addr); __u64 xsk_umem__extract_offset(__u64 addr); __u64 xsk_umem__add_offset_to_addr(__u64 addr); \fP .fi .RE .PP There is one more function in the data path and that checks if the need_wakeup flag is set. Use of this flag is highly encouraged and should be enabled by setting \fIXDP_USE_NEED_WAKEUP\fP bit in the \fIxdp_bind_flags\fP field that is provided to the xsk_socket_create_[shared]() calls. If this function returns true, then you need to call \fIrecvmsg()\fP, \fIsendto()\fP, or \fIpoll()\fP depending on the situation. \fIrecvmsg()\fP if you are \fBreceiving\fP, or \fIsendto()\fP if you are \fBsending\fP. \fIpoll()\fP can be used for both cases and provide the ability to sleep too, as with any other socket. 
But note that poll is a slower operation than the other two. .RS .nf \fCint xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); \fP .fi .RE .PP For an example on how to use all these APIs, take a look at the AF_XDP-example and AF_XDP-forwarding programs in the bpf-examples repository: \fIhttps://github.com/xdp-project/bpf-examples\fP. .SH "Kernel and BPF program feature compatibility" .PP The features exposed by libxdp relies on certain kernel versions and BPF features to work. To get the full benefit of all features, libxdp needs to be used with kernel 5.10 or newer, unless the commits mentioned below have been backported. However, libxdp will probe the kernel and transparently fall back to legacy loading procedures, so it is possible to use the library with older versions, although some features will be unavailable, as detailed below. .PP The ability to attach multiple BPF programs to a single interface relies on the kernel "BPF program extension" feature which was introduced by commit be8704ff07d2 ("bpf: Introduce dynamic program extensions") in the upstream kernel and first appeared in kernel release 5.6. To \fBincrementally\fP attach multiple programs, a further refinement added by commit 4a1e7c0c63e0 ("bpf: Support attaching freplace programs to multiple attach points") is needed; this first appeared in the upstream kernel version 5.10. The functionality relies on the "BPF trampolines" feature which is available on architectures that support it. 
.PP The support matrix is as follows: .IP \(em 4 Kernels before 5.6 can only attach a single XDP program to each interface .IP \(em 4 Kernels 5.6+ can attach multiple programs if they are all attached at the same time .IP \(em 4 Kernels 5.10+ have full support for XDP multiprog on architectures supporting BPF trampolines .IP \(em 4 On architectures without BPF trampoline support, only a single program can be attached to each interface .PP To load AF_XDP programs, kernel support for AF_XDP sockets needs to be included and enabled in the kernel build. In addition, when using AF_XDP sockets, an XDP program is also loaded on the interface. The XDP program used for this by libxdp requires the ability to do map lookups into XSK maps, which was introduced with commit fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap") in kernel 5.3. This means that the minimum required kernel version for using AF_XDP is kernel 5.3; however, for the AF_XDP XDP program to co-exist with other programs, the same constraints for multiprog applies as outlined above. .PP Note that some Linux distributions backport features to earlier kernel versions, especially in enterprise kernels; for instance, Red Hat Enterprise Linux kernels include everything needed for libxdp to function since RHEL 8.5. .PP Finally, XDP programs loaded using the multiprog facility must include type information (using the BPF Type Format, BTF). To get this, compile the programs with a recent version of Clang/LLVM (version 10+), and enable debug information when compiling (using the \fI\-g\fP option). .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHORS" .PP libxdp and this man page were written by Toke Høiland-Jørgensen. AF_XDP support and documentation was contributed by Magnus Karlsson. 
xdp-tools-1.6.1/lib/libxdp/libxdp.c000066400000000000000000002536161514310632100171370ustar00rootroot00000000000000// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * XDP management utility functions * * Copyright (C) 2020 Toke Høiland-Jørgensen */ #include #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* ERR_PTR */ #include #include #include #include #include #include #include #include "compat.h" #include "libxdp_internal.h" #define XDP_RUN_CONFIG_SEC ".xdp_run_config" #define XDP_SKIP_ENVVAR "LIBXDP_SKIP_DISPATCHER" /* When cloning BPF fds, we want to make sure they don't end up as any of the * standard stdin, stderr, stdout descriptors: fd 0 can confuse the kernel, and * there are orchestration systems that will force-close the others if they * don't point to the "right" things. So just to be safe, use 3 as the minimum * fd number. */ #define MIN_FD 3 /* Max number of times we retry attachment */ #define MAX_RETRY 10 #define IFINDEX_LO 1 static const char *dispatcher_feature_err = "This means that the kernel does not support the features needed\n" "by the multiprog dispatcher, either because it is too old entirely,\n" "or because it is not yet supported on the current architecture.\n"; struct xdp_program { /* one of prog or prog_fd should be set */ struct bpf_program *bpf_prog; struct bpf_object *bpf_obj; struct btf *btf; enum bpf_prog_type prog_type; int prog_fd; int link_fd; char *prog_name; char *attach_name; __u8 prog_tag[BPF_TAG_SIZE]; __u32 prog_id; __u64 load_time; bool from_external_obj; bool is_frags; unsigned int run_prio; unsigned int chain_call_actions; /* bitmap */ /* for building list of attached programs to multiprog */ struct xdp_program *next; }; struct xdp_multiprog { struct xdp_dispatcher_config config; struct xdp_program *main_prog; /* dispatcher or legacy prog pointer */ struct xdp_program *first_prog; /* uses 
xdp_program->next to build a list */ struct xdp_program *hw_prog; __u32 version; size_t num_links; bool is_loaded; bool is_legacy; bool kernel_frags_support; bool kernel_devbound_support; bool checked_compat; enum xdp_attach_mode attach_mode; int ifindex; }; #define XDP_DISPATCHER_VERSION_V1 1 struct xdp_dispatcher_config_v1 { __u8 num_progs_enabled; /* Number of active program slots */ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; }; #define XDP_DISPATCHER_VERSION_V2 2 struct xdp_dispatcher_config_v2 { __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ __u8 num_progs_enabled; /* Number of active program slots */ __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; __u32 program_flags[MAX_DISPATCHER_ACTIONS]; }; static const char *xdp_action_names[] = { [XDP_ABORTED] = "XDP_ABORTED", [XDP_DROP] = "XDP_DROP", [XDP_PASS] = "XDP_PASS", [XDP_TX] = "XDP_TX", [XDP_REDIRECT] = "XDP_REDIRECT", }; static struct xdp_program *xdp_program__create_from_obj(struct bpf_object *obj, const char *section_name, const char *prog_name, bool external); #ifdef LIBXDP_STATIC struct xdp_embedded_obj { const char *filename; const void *data_start; const void *data_end; }; extern const char _binary_xdp_dispatcher_o_start; extern const char _binary_xdp_dispatcher_o_end; extern const char _binary_xsk_def_xdp_prog_o_start; extern const char _binary_xsk_def_xdp_prog_o_end; extern const char _binary_xsk_def_xdp_prog_5_3_o_start; extern const char _binary_xsk_def_xdp_prog_5_3_o_end; static struct xdp_embedded_obj embedded_objs[] = { {"xdp-dispatcher.o", &_binary_xdp_dispatcher_o_start, &_binary_xdp_dispatcher_o_end}, {"xsk_def_xdp_prog.o", &_binary_xsk_def_xdp_prog_o_start, &_binary_xsk_def_xdp_prog_o_end}, {"xsk_def_xdp_prog_5.3.o", &_binary_xsk_def_xdp_prog_5_3_o_start, 
&_binary_xsk_def_xdp_prog_5_3_o_end}, {}, }; static struct xdp_program *xdp_program__find_embedded(const char *filename, const char *section_name, const char *prog_name, struct bpf_object_open_opts *opts) { DECLARE_LIBBPF_OPTS(bpf_object_open_opts, default_opts, .object_name = filename, ); struct xdp_embedded_obj *eobj; struct bpf_object *obj; size_t size; int err; for (eobj = &embedded_objs[0]; eobj->filename; eobj++) { if (strcmp(filename, eobj->filename)) continue; size = eobj->data_end - eobj->data_start; /* set the object name to the same as if we opened the file from * the filesystem */ if (!opts) opts = &default_opts; else if (!opts->object_name) opts->object_name = filename; pr_debug("Loading XDP program '%s' from embedded object file\n", filename); obj = bpf_object__open_mem(eobj->data_start, size, opts); err = libbpf_get_error(obj); if (err) return ERR_PTR(err); return xdp_program__create_from_obj(obj, section_name, prog_name, false); } return NULL; } #else static inline struct xdp_program *xdp_program__find_embedded(__unused const char *filename, __unused const char *section_name, __unused const char *prog_name, __unused struct bpf_object_open_opts *opts) { return NULL; } #endif static int __base_pr(enum libxdp_print_level level, const char *format, va_list args) { if (level == LIBXDP_DEBUG) return 0; return vfprintf(stderr, format, args); } static libxdp_print_fn_t __libxdp_pr = __base_pr; libxdp_print_fn_t libxdp_set_print(libxdp_print_fn_t fn) { libxdp_print_fn_t old_print_fn = __libxdp_pr; __libxdp_pr = fn; return old_print_fn; } __printf(2, 3) void libxdp_print(enum libxdp_print_level level, const char *format, ...) 
{ va_list args; if (!__libxdp_pr) return; va_start(args, format); __libxdp_pr(level, format, args); va_end(args); } static enum { COMPAT_UNKNOWN, COMPAT_SUPPORTED, COMPAT_UNSUPPORTED } kernel_compat = COMPAT_UNKNOWN; static int xdp_multiprog__attach(struct xdp_multiprog *old_mp, struct xdp_multiprog *mp, enum xdp_attach_mode mode); static struct xdp_multiprog *xdp_multiprog__generate(struct xdp_program **progs, size_t num_progs, int ifindex, struct xdp_multiprog *old_mp, bool remove_progs, unsigned int flags); static int xdp_multiprog__pin(struct xdp_multiprog *mp); static int xdp_multiprog__unpin(struct xdp_multiprog *mp); /* On NULL, libxdp always sets errno to 0 for old APIs, so that their * compatibility is maintained wrt old libxdp_get_error that called the older * version of libbpf_get_error which did PTR_ERR_OR_ZERO, but newer versions * unconditionally return -errno on seeing NULL, as the libbpf practice changed * to returning NULL or errors. * * The new APIs (like xdp_program__create) which indicate error using NULL set * their errno when returning NULL. 
*/ long libxdp_get_error(const void *ptr) { if (!IS_ERR_OR_NULL(ptr)) return 0; if (IS_ERR(ptr)) errno = -PTR_ERR(ptr); return -errno; } int libxdp_strerror(int err, char *buf, size_t size) { return libxdp_err(libbpf_strerror(err, buf, size)); } static char *libxdp_strerror_r(int err, char *dst, size_t size) { int ret = libxdp_strerror(err, dst, size); if (ret) snprintf(dst, size, "ERROR: strerror_r(%d)=%d", err, ret); return dst; } #ifndef HAVE_LIBBPF_BTF__LOAD_FROM_KERNEL_BY_ID static struct btf *btf__load_from_kernel_by_id(__u32 id) { struct btf *btf; int err; err = btf__get_from_id(id, &btf); if (err) return NULL; return btf; } #endif #ifndef HAVE_LIBBPF_BTF__TYPE_CNT static __u32 btf__type_cnt(const struct btf *btf) { /* old function didn't include 'void' type in count */ return btf__get_nr_types(btf) + 1; } #endif #ifndef HAVE_LIBBPF_BPF_OBJECT__NEXT_MAP static struct bpf_map *bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map) { return bpf_map__next(map, obj); } #endif #ifndef HAVE_LIBBPF_BPF_OBJECT__NEXT_PROGRAM static struct bpf_program *bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) { return bpf_program__next(prog, obj); } #endif #ifndef HAVE_LIBBPF_BPF_PROGRAM__INSN_CNT #define BPF_INSN_SZ (sizeof(struct bpf_insn)) static size_t bpf_program__insn_cnt(const struct bpf_program *prog) { size_t sz; sz = bpf_program__size(prog); return sz / BPF_INSN_SZ; } #endif #ifndef HAVE_LIBBPF_BPF_PROGRAM__TYPE static inline enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) { return bpf_program__get_type((struct bpf_program *)prog); } #endif #ifndef HAVE_LIBBPF_BPF_PROGRAM__FLAGS static __u32 bpf_program__flags(__unused const struct bpf_program *prog) { /* When libbpf doesn't support this we can't get the real value. 
* Returning 0 works because the callers check for the presence of a * specific flag (BPF_F_XDP_HAS_FRAGS), and having it always-off * disables the frags functionality which is what we want. */ return 0; } #endif /* This function has been deprecated in libbpf, but we expose an API that uses * section names, so we reimplement it to keep compatibility */ static struct bpf_program * bpf_program_by_section_name(const struct bpf_object *obj, const char *section_name) { struct bpf_program *pos; const char *sname; bpf_object__for_each_program(pos, obj) { sname = bpf_program__section_name(pos); if (sname && !strcmp(sname, section_name)) return pos; } return NULL; } static bool bpf_is_valid_mntpt(const char *mnt) { struct statfs st_fs; if (statfs(mnt, &st_fs) < 0) return false; if ((unsigned long)st_fs.f_type != BPF_FS_MAGIC) return false; return true; } static int bpf_mnt_fs(const char *target) { bool bind_done = false; int err; retry: err = mount("", target, "none", MS_PRIVATE | MS_REC, NULL); if (err) { if (errno != EINVAL || bind_done) { err = -errno; pr_warn("mount --make-private %s failed: %s\n", target, strerror(-err)); return err; } err = mount(target, target, "none", MS_BIND, NULL); if (err) { err = -errno; pr_warn("mount --bind %s %s failed: %s\n", target, target, strerror(-err)); return err; } bind_done = true; goto retry; } err = mount("bpf", target, "bpf", 0, "mode=0700"); if (err) { err = -errno; pr_warn("mount -t bpf bpf %s failed: %s\n", target, strerror(-err)); return err; } return 0; } static const char *bpf_find_mntpt_single(char *mnt, int len, const char *mntpt, bool mount) { int err; if (!bpf_is_valid_mntpt(mntpt)) { if (!mount) return NULL; pr_debug("No bpffs found at %s, mounting a new one\n", mntpt); err = bpf_mnt_fs(mntpt); if (err) return NULL; } strncpy(mnt, mntpt, len - 1); mnt[len - 1] = '\0'; return mnt; } static const char *find_bpffs() { static bool bpf_mnt_cached = false; static char bpf_wrk_dir[PATH_MAX]; static const char *mnt = NULL; char 
*envdir, *envval; bool mount = false; if (bpf_mnt_cached) return mnt; envdir = secure_getenv(XDP_BPFFS_ENVVAR); envval = secure_getenv(XDP_BPFFS_MOUNT_ENVVAR); if (envval && envval[0] == '1' && envval[1] == '\0') mount = true; mnt = bpf_find_mntpt_single(bpf_wrk_dir, sizeof(bpf_wrk_dir), envdir ?: BPF_DIR_MNT, mount); if (!mnt) pr_warn("No bpffs found at %s\n", envdir ?: BPF_DIR_MNT); else bpf_mnt_cached = 1; return mnt; } static int mk_state_subdir(char *dir, size_t dir_sz, const char *parent) { int err; err = try_snprintf(dir, dir_sz, "%s/xdp", parent); if (err) return err; err = mkdir(dir, S_IRWXU); if (err && errno != EEXIST) return -errno; return 0; } static const char *get_bpffs_dir(void) { static char bpffs_dir[PATH_MAX]; static const char *dir = NULL; const char *parent; int err; if (dir) return dir; parent = find_bpffs(); if (!parent) { err = -ENOENT; goto err; } err = mk_state_subdir(bpffs_dir, sizeof(bpffs_dir), parent); if (err) goto err; dir = bpffs_dir; return dir; err: return ERR_PTR(err); } static const char *get_lock_dir(void) { static const char *dir = NULL; static char rundir[PATH_MAX]; int err; if (dir) return dir; dir = get_bpffs_dir(); if (!IS_ERR(dir)) return dir; err = mk_state_subdir(rundir, sizeof(rundir), RUNDIR); if (err) return ERR_PTR(err); dir = rundir; return dir; } int xdp_lock_acquire(void) { int lock_fd, err; const char *dir; dir = get_lock_dir(); if (IS_ERR(dir)) return PTR_ERR(dir); lock_fd = open(dir, O_DIRECTORY); if (lock_fd < 0) { err = -errno; pr_warn("Couldn't open lock directory at %s: %s\n", dir, strerror(-err)); return err; } err = flock(lock_fd, LOCK_EX); if (err) { err = -errno; pr_warn("Couldn't flock fd %d: %s\n", lock_fd, strerror(-err)); close(lock_fd); return err; } pr_debug("Acquired lock from %s with fd %d\n", dir, lock_fd); return lock_fd; } int xdp_lock_release(int lock_fd) { int err; err = flock(lock_fd, LOCK_UN); if (err) { err = -errno; pr_warn("Couldn't unlock fd %d: %s\n", lock_fd, strerror(-err)); } 
else { pr_debug("Released lock fd %d\n", lock_fd); } close(lock_fd); return err; } static int do_xdp_attach(int ifindex, int prog_fd, int old_fd, __u32 xdp_flags) { #ifdef HAVE_LIBBPF_BPF_XDP_ATTACH LIBBPF_OPTS(bpf_xdp_attach_opts, opts, .old_prog_fd = old_fd); return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, &opts); #else DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = old_fd); return bpf_set_link_xdp_fd_opts(ifindex, prog_fd, xdp_flags, old_fd ? &opts : NULL); #endif } int xdp_attach_fd(int prog_fd, int old_fd, int ifindex, enum xdp_attach_mode mode) { int err = 0, xdp_flags = 0; pr_debug("Replacing XDP fd %d with %d on ifindex %d\n", old_fd, prog_fd, ifindex); if (old_fd == -1) { xdp_flags |= XDP_FLAGS_UPDATE_IF_NOEXIST; old_fd = 0; } switch (mode) { case XDP_MODE_SKB: xdp_flags |= XDP_FLAGS_SKB_MODE; break; case XDP_MODE_NATIVE: xdp_flags |= XDP_FLAGS_DRV_MODE; break; case XDP_MODE_HW: xdp_flags |= XDP_FLAGS_HW_MODE; break; case XDP_MODE_UNSPEC: break; } again: err = do_xdp_attach(ifindex, prog_fd, old_fd, xdp_flags); if (err < 0) { if (err == -EINVAL && old_fd) { pr_debug("Got 'invalid argument', trying again without old_fd\n"); old_fd = 0; goto again; } pr_info("Error attaching XDP program to ifindex %d: %s\n", ifindex, strerror(-err)); if (err == -EEXIST && old_fd) /* We raced with another attach/detach, have to retry */ return -EAGAIN; switch (-err) { case EBUSY: case EEXIST: pr_info("XDP already loaded on device\n"); break; case EOPNOTSUPP: pr_info("XDP mode not supported; try using SKB mode\n"); break; default: break; } } return err; } const struct btf *xdp_program__btf(struct xdp_program *xdp_prog) { if (!xdp_prog) return libxdp_err_ptr(0, true); return xdp_prog->btf; } enum xdp_attach_mode xdp_program__is_attached(const struct xdp_program *xdp_prog, int ifindex) { struct xdp_program *prog = NULL; struct xdp_multiprog *mp; enum xdp_attach_mode ret = XDP_MODE_UNSPEC; if (!xdp_prog || !xdp_prog->prog_id) return ret; mp = 
xdp_multiprog__get_from_ifindex(ifindex); if (IS_ERR_OR_NULL(mp)) return ret; prog = xdp_multiprog__hw_prog(mp); if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) { ret = XDP_MODE_HW; goto out; } if (xdp_multiprog__is_legacy(mp)) { prog = xdp_multiprog__main_prog(mp); if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) ret = xdp_multiprog__attach_mode(mp); goto out; } while ((prog = xdp_multiprog__next_prog(prog, mp))) { if (xdp_program__id(prog) == xdp_program__id(xdp_prog)) { ret = xdp_multiprog__attach_mode(mp); break; } } out: xdp_multiprog__close(mp); return ret; } int xdp_program__set_chain_call_enabled(struct xdp_program *prog, unsigned int action, bool enabled) { if (IS_ERR_OR_NULL(prog) || prog->prog_fd >= 0 || action >= XDP_DISPATCHER_RETVAL) return libxdp_err(-EINVAL); if (enabled) prog->chain_call_actions |= (1U << action); else prog->chain_call_actions &= ~(1U << action); return 0; } bool xdp_program__chain_call_enabled(const struct xdp_program *prog, enum xdp_action action) { if (IS_ERR_OR_NULL(prog) || action >= XDP_DISPATCHER_RETVAL) return false; return !!(prog->chain_call_actions & (1U << action)); } unsigned int xdp_program__run_prio(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return XDP_DEFAULT_RUN_PRIO; return prog->run_prio; } int xdp_program__set_run_prio(struct xdp_program *prog, unsigned int run_prio) { if (IS_ERR_OR_NULL(prog) || prog->prog_fd >= 0) return libxdp_err(-EINVAL); prog->run_prio = run_prio; return 0; } bool xdp_program__xdp_frags_support(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return false; /* Until we load the program we just check the bpf_program__flags() to * ensure any changes made to those are honoured on the libxdp side. For * loaded programs we keep our own state variable which is populated * either by copying over the program flags in xdp_program__load(), or * by loading the state from the dispatcher state variables if * instantiating the object from the kernel. 
*/ if (!prog->bpf_prog || prog->prog_fd >= 0) return prog->is_frags; return !!(bpf_program__flags(prog->bpf_prog) & BPF_F_XDP_HAS_FRAGS); } #ifndef HAVE_LIBBPF_BPF_PROGRAM__FLAGS int xdp_program__set_xdp_frags_support(__unused struct xdp_program *prog, __unused bool frags) { return libxdp_err(-EOPNOTSUPP); } #else int xdp_program__set_xdp_frags_support(struct xdp_program *prog, bool frags) { __u32 prog_flags; int ret; if (IS_ERR_OR_NULL(prog) || !prog->bpf_prog || prog->prog_fd >= 0) return libxdp_err(-EINVAL); prog_flags = bpf_program__flags(prog->bpf_prog); if (frags) prog_flags |= BPF_F_XDP_HAS_FRAGS; else prog_flags &= ~BPF_F_XDP_HAS_FRAGS; ret = bpf_program__set_flags(prog->bpf_prog, prog_flags); if (!ret) prog->is_frags = frags; return ret; } #endif // HAVE_LIBBPF_BPF_PROGRAM__FLAGS const char *xdp_program__name(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return libxdp_err_ptr(0, true); return prog->prog_name; } struct bpf_object *xdp_program__bpf_obj(struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return libxdp_err_ptr(0, true); return prog->bpf_obj; } const unsigned char *xdp_program__tag(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return libxdp_err_ptr(0, true); return prog->prog_tag; } uint32_t xdp_program__id(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return 0; return prog->prog_id; } int xdp_program__fd(const struct xdp_program *prog) { if (IS_ERR_OR_NULL(prog)) return errno = ENOENT, -1; return prog->prog_fd; } int xdp_program__print_chain_call_actions(const struct xdp_program *prog, char *buf, size_t buf_len) { bool first = true; char *pos = buf; int i, len = 0; if (IS_ERR_OR_NULL(prog) || !buf || !buf_len) return libxdp_err(-EINVAL); for (i = 0; i <= XDP_REDIRECT; i++) { if (xdp_program__chain_call_enabled(prog, i)) { if (!first) { if (!buf_len) goto err_len; *pos++ = ','; buf_len--; } else { first = false; } len = snprintf(pos, buf_len, "%s", xdp_action_names[i]); if (len < 0 || (size_t)len >= 
buf_len) goto err_len; pos += len; buf_len -= len; } } return 0; err_len: *pos = '\0'; return libxdp_err(-ENOSPC); } static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { const struct btf_type *t = btf__type_by_id(btf, id); if (res_id) *res_id = id; while (btf_is_mod(t) || btf_is_typedef(t)) { if (res_id) *res_id = t->type; t = btf__type_by_id(btf, t->type); } return t; } static bool get_field_int(const struct btf *btf, const char *t_name, const struct btf_type *t, __u32 *res) { const struct btf_array *arr_info; const struct btf_type *arr_t; if (!btf_is_ptr(t)) { pr_warn("attr '%s': expected PTR, got %u.\n", t_name, btf_kind(t)); return false; } arr_t = btf__type_by_id(btf, t->type); if (!arr_t) { pr_warn("attr '%s': type [%u] not found.\n", t_name, t->type); return false; } if (!btf_is_array(arr_t)) { pr_warn("attr '%s': expected ARRAY, got %u.\n", t_name, btf_kind(arr_t)); return false; } arr_info = btf_array(arr_t); *res = arr_info->nelems; return true; } static bool get_xdp_action(const char *act_name, unsigned int *act) { const char **name = xdp_action_names; unsigned int i; for (i = 0; i < ARRAY_SIZE(xdp_action_names); i++, name++) { if (!strcmp(act_name, *name)) { *act = i; return true; } } return false; } /* * Find BTF func definition for func_name, which may be a truncated prefix of * the real function name. * Return NULL on no, or ambiguous, match. 
*/ static const struct btf_type *btf_get_function(const struct btf *btf, const char *func_name) { const struct btf_type *t, *match; size_t len, matches = 0; const char *name; int nr_types, i; if (!btf) { pr_debug("No BTF found for program\n"); return NULL; } len = strlen(func_name); nr_types = btf__type_cnt(btf); for (i = 1; i < nr_types; i++) { t = btf__type_by_id(btf, i); if (!btf_is_func(t)) continue; name = btf__name_by_offset(btf, t->name_off); if (!strncmp(name, func_name, len)) { pr_debug("Found func %s matching %s\n", name, func_name); if (strlen(name) == len) return t; /* exact match */ /* prefix, may not be unique */ matches++; match = t; } } if (matches == 1) /* unique match */ return match; pr_debug("Function '%s' not found or ambiguous (%zu matches).\n", func_name, matches); return NULL; } static const struct btf_type *btf_get_datasec(const struct btf *btf, const char *sec_name) { const struct btf_type *t; int nr_types, i; const char *name; if (!btf) { pr_debug("No BTF found for program\n"); return NULL; } nr_types = btf__type_cnt(btf); for (i = 1; i < nr_types; i++) { t = btf__type_by_id(btf, i); if (!btf_is_datasec(t)) continue; name = btf__name_by_offset(btf, t->name_off); if (strcmp(name, sec_name) == 0) return t; } pr_debug("DATASEC '%s' not found.\n", sec_name); return NULL; } static const struct btf_type *btf_get_section_var(const struct btf *btf, const struct btf_type *sec, const char *var_name, __u16 kind) { const struct btf_var_secinfo *vi; const struct btf_var *var_extra; const struct btf_type *var, *def; const char *name; int vlen, i; vlen = btf_vlen(sec); vi = btf_var_secinfos(sec); for (i = 0; i < vlen; i++, vi++) { var = btf__type_by_id(btf, vi->type); var_extra = btf_var(var); name = btf__name_by_offset(btf, var->name_off); if (strcmp(name, var_name)) continue; if (!btf_is_var(var)) { pr_warn("struct '%s': unexpected var kind %u.\n", name, btf_kind(var)); return ERR_PTR(-EINVAL); } if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && 
var_extra->linkage != BTF_VAR_STATIC) { pr_warn("struct '%s': unsupported var linkage %u.\n", name, var_extra->linkage); return ERR_PTR(-EOPNOTSUPP); } def = skip_mods_and_typedefs(btf, var->type, NULL); if (btf_kind(def) != kind) { pr_warn("var '%s': unexpected def kind %u.\n", name, btf_kind(def)); return ERR_PTR(-EINVAL); } return def; } return ERR_PTR(-ENOENT); } /** * This function parses the run config information attached to an XDP program. * * This information is specified using BTF, in a format similar to how * BTF-defined maps are done. The definition looks like this: * * struct { * __uint(priority, 10); * __uint(XDP_PASS, 1); * } XDP_RUN_CONFIG(FUNCNAME); * * The priority is simply an integer that will be used to sort programs as they * are attached on the interface (see cmp_xdp_programs() for full sort order). * In addition to the priority, the run config can define an integer value for * each XDP action. A non-zero value means that execution will continue to the * next loaded program if the current program returns that action. I.e., in the * above example, any return value other than XDP_PASS will cause the dispatcher * to exit with that return code, whereas XDP_PASS means execution will * continue. * * Since this information becomes part of the object file BTF info, it will * survive loading into the kernel, and so it can be retrieved for * already-loaded programs as well. */ static int xdp_program__parse_btf(struct xdp_program *xdp_prog, const struct btf *btf) { const struct btf_type *def, *sec; const struct btf_member *m; char struct_name[100]; int err, i, mlen; if (!btf) btf = xdp_program__btf(xdp_prog); /* If the program name is the maximum allowed object name in the kernel, * it may have been truncated, in which case we try to expand it by * looking for a match in the BTF data. 
*/ if (strlen(xdp_prog->prog_name) >= BPF_OBJ_NAME_LEN - 1) { const struct btf_type *func; char *name; func = btf_get_function(btf, xdp_prog->prog_name); if (func) { name = strdup(btf__name_by_offset(btf, func->name_off)); if (!name) return -ENOMEM; free(xdp_prog->prog_name); xdp_prog->prog_name = name; } } err = try_snprintf(struct_name, sizeof(struct_name), "_%s", xdp_program__name(xdp_prog)); if (err) return err; sec = btf_get_datasec(btf, XDP_RUN_CONFIG_SEC); if (!sec) return -ENOENT; def = btf_get_section_var(btf, sec, struct_name, BTF_KIND_STRUCT); if (IS_ERR(def)) { pr_debug("Couldn't find run order struct %s\n", struct_name); return PTR_ERR(def); } mlen = btf_vlen(def); m = btf_members(def); for (i = 0; i < mlen; i++, m++) { const char *mname = btf__name_by_offset(btf, m->name_off); const struct btf_type *m_t; unsigned int val, act; if (!mname) { pr_warn("struct '%s': invalid field #%d.\n", struct_name, i); return -EINVAL; } m_t = skip_mods_and_typedefs(btf, m->type, NULL); if (!strcmp(mname, "priority")) { if (!get_field_int(btf, mname, m_t, &xdp_prog->run_prio)) return -EINVAL; continue; } else if (get_xdp_action(mname, &act)) { if (!get_field_int(btf, mname, m_t, &val)) return -EINVAL; xdp_program__set_chain_call_enabled(xdp_prog, act, val); } else { pr_warn("Invalid mname: %s\n", mname); return -ENOTSUP; } } return 0; } static struct xdp_program *xdp_program__new(void) { struct xdp_program *xdp_prog; xdp_prog = malloc(sizeof(*xdp_prog)); if (!xdp_prog) return ERR_PTR(-ENOMEM); memset(xdp_prog, 0, sizeof(*xdp_prog)); xdp_prog->prog_fd = -1; xdp_prog->link_fd = -1; xdp_prog->run_prio = XDP_DEFAULT_RUN_PRIO; xdp_prog->chain_call_actions = XDP_DEFAULT_CHAIN_CALL_ACTIONS; return xdp_prog; } void xdp_program__close(struct xdp_program *xdp_prog) { if (!xdp_prog) return; if (xdp_prog->link_fd >= 0) close(xdp_prog->link_fd); if (xdp_prog->prog_fd >= 0) close(xdp_prog->prog_fd); free(xdp_prog->prog_name); free(xdp_prog->attach_name); if 
(!xdp_prog->from_external_obj) { if (xdp_prog->bpf_obj) bpf_object__close(xdp_prog->bpf_obj); else if (xdp_prog->btf) btf__free(xdp_prog->btf); } free(xdp_prog); } static struct xdp_program *xdp_program__create_from_obj(struct bpf_object *obj, const char *section_name, const char *prog_name, bool external) { struct xdp_program *xdp_prog; struct bpf_program *bpf_prog; int err; if (!obj || (section_name && prog_name)) return ERR_PTR(-EINVAL); if (section_name) bpf_prog = bpf_program_by_section_name(obj, section_name); else if (prog_name) bpf_prog = bpf_object__find_program_by_name(obj, prog_name); else bpf_prog = bpf_object__next_program(obj, NULL); if (!bpf_prog) { pr_warn("Couldn't find xdp program in bpf object%s%s\n", section_name ? " section " : "", section_name ?: ""); return ERR_PTR(-ENOENT); } xdp_prog = xdp_program__new(); if (IS_ERR(xdp_prog)) return xdp_prog; xdp_prog->prog_name = strdup(bpf_program__name(bpf_prog)); if (!xdp_prog->prog_name) { err = -ENOMEM; goto err; } err = xdp_program__parse_btf(xdp_prog, bpf_object__btf(obj)); if (err && err != -ENOENT) goto err; xdp_prog->bpf_prog = bpf_prog; xdp_prog->bpf_obj = obj; xdp_prog->btf = bpf_object__btf(obj); xdp_prog->from_external_obj = external; return xdp_prog; err: xdp_program__close(xdp_prog); return ERR_PTR(err); } struct xdp_program *xdp_program__from_bpf_obj(struct bpf_object *obj, const char *section_name) { struct xdp_program *prog; prog = xdp_program__create_from_obj(obj, section_name, NULL, true); /* xdp_program__create_from_obj does not return NULL */ if (!IS_ERR(prog)) return prog; return libxdp_err_ptr(PTR_ERR(prog), false); } static struct bpf_object *open_bpf_obj(const char *filename, struct bpf_object_open_opts *opts) { struct bpf_object *obj; int err; obj = bpf_object__open_file(filename, opts); err = libbpf_get_error(obj); if (err) { if (err == -ENOENT) pr_debug( "Couldn't load the eBPF program (libbpf said 'no such file').\n" "Maybe the program was compiled with a too old " "version 
of LLVM (need v9.0+)?\n"); return ERR_PTR(err); } return obj; } static struct xdp_program *__xdp_program__open_file(const char *filename, const char *section_name, const char *prog_name, struct bpf_object_open_opts *opts) { struct xdp_program *xdp_prog; struct bpf_object *obj; int err; if (!filename) return ERR_PTR(-EINVAL); obj = open_bpf_obj(filename, opts); if (IS_ERR(obj)) { err = PTR_ERR(obj); return ERR_PTR(err); } xdp_prog = xdp_program__create_from_obj(obj, section_name, prog_name, false); if (IS_ERR(xdp_prog)) bpf_object__close(obj); return xdp_prog; } struct xdp_program *xdp_program__open_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts) { struct xdp_program *prog; prog = __xdp_program__open_file(filename, section_name, NULL, opts); /* __xdp_program__open_file does not return NULL */ if (!IS_ERR(prog)) return prog; return libxdp_err_ptr(PTR_ERR(prog), false); } static bool try_bpf_file(char *buf, size_t buf_size, char *path, const char *progname) { struct stat sb = {}; if (try_snprintf(buf, buf_size, "%s/%s", path, progname)) return false; pr_debug("Looking for '%s'\n", buf); if (stat(buf, &sb)) return false; return true; } static int find_bpf_file(char *buf, size_t buf_size, const char *progname) { static char *bpf_obj_paths[] = { #ifdef DEBUG ".", #endif BPF_OBJECT_PATH, NULL }; char *path, **p; path = secure_getenv(XDP_OBJECT_ENVVAR); if (path && try_bpf_file(buf, buf_size, path, progname)) { return 0; } else if (!path) { for (p = bpf_obj_paths; *p; p++) if (try_bpf_file(buf, buf_size, *p, progname)) return 0; } pr_warn("Couldn't find a BPF file with name %s\n", progname); return -ENOENT; } static struct xdp_program *__xdp_program__find_file(const char *filename, const char *section_name, const char *prog_name, struct bpf_object_open_opts *opts) { struct xdp_program *prog; char buf[PATH_MAX]; int err; prog = xdp_program__find_embedded(filename, section_name, prog_name, opts); if (prog) return prog; err = 
find_bpf_file(buf, sizeof(buf), filename); if (err) return ERR_PTR(err); pr_debug("Loading XDP program from '%s' section '%s'\n", buf, section_name ?: (prog_name ?: "(unknown)")); return __xdp_program__open_file(buf, section_name, prog_name, opts); } struct xdp_program *xdp_program__find_file(const char *filename, const char *section_name, struct bpf_object_open_opts *opts) { struct xdp_program *prog; prog = __xdp_program__find_file(filename, section_name, NULL, opts); /* __xdp_program__find_file does not return NULL */ if (!IS_ERR(prog)) return prog; return libxdp_err_ptr(PTR_ERR(prog), false); } static int xdp_program__fill_from_fd(struct xdp_program *xdp_prog, int fd) { struct bpf_prog_info info = {}; __u32 len = sizeof(info); struct btf *btf = NULL; int err = 0, prog_fd; if (!xdp_prog) return -EINVAL; /* Duplicate the descriptor, as we take ownership of the fd below */ prog_fd = fcntl(fd, F_DUPFD_CLOEXEC, MIN_FD); if (prog_fd < 0) { err = -errno; pr_debug("Error on fcntl: %s", strerror(-err)); return err; } err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); if (err) { err = -errno; pr_warn("couldn't get program info: %s", strerror(-err)); goto err; } if (!xdp_prog->prog_name) { xdp_prog->prog_name = strdup(info.name); if (!xdp_prog->prog_name) { err = -ENOMEM; pr_warn("failed to strdup program title"); goto err; } } if (info.btf_id && !xdp_prog->btf) { btf = btf__load_from_kernel_by_id(info.btf_id); if (!btf) { pr_warn("Couldn't get BTF for ID %ul\n", info.btf_id); goto err; } xdp_prog->btf = btf; } pr_debug("Duplicated fd %d to %d for prog %s\n", fd, prog_fd, xdp_prog->prog_name); memcpy(xdp_prog->prog_tag, info.tag, BPF_TAG_SIZE); xdp_prog->load_time = info.load_time; xdp_prog->prog_fd = prog_fd; xdp_prog->prog_id = info.id; xdp_prog->prog_type = info.type; return 0; err: close(prog_fd); btf__free(btf); return err; } struct xdp_program *xdp_program__from_fd(int fd) { struct xdp_program *xdp_prog = NULL; int err; xdp_prog = xdp_program__new(); if 
(IS_ERR(xdp_prog)) return libxdp_err_ptr(PTR_ERR(xdp_prog), false); err = xdp_program__fill_from_fd(xdp_prog, fd); if (err) goto err; err = xdp_program__parse_btf(xdp_prog, NULL); if (err && err != -ENOENT) goto err; return xdp_prog; err: xdp_program__close(xdp_prog); return libxdp_err_ptr(err, false); } struct xdp_program *xdp_program__from_id(__u32 id) { struct xdp_program *prog; int fd, err; fd = bpf_prog_get_fd_by_id(id); if (fd < 0) { err = -errno; pr_warn("couldn't get program fd: %s", strerror(-err)); return libxdp_err_ptr(err, false); } prog = xdp_program__from_fd(fd); // duplicated fd already in prog, close original close(fd); if (IS_ERR(prog)) { err = errno; errno = err; } return prog; } struct xdp_program *xdp_program__from_pin(const char *pin_path) { struct xdp_program *prog; int fd, err; fd = bpf_obj_get(pin_path); if (fd < 0) { err = -errno; pr_warn("couldn't get program fd from %s: %s", pin_path, strerror(-err)); return libxdp_err_ptr(err, false); } prog = xdp_program__from_fd(fd); // duplicated fd already in prog, close original close(fd); if (IS_ERR(prog)) { err = errno; errno = err; } return prog; } struct xdp_program *xdp_program__create(struct xdp_program_opts *opts) { const char *pin_path, *prog_name, *find_filename, *open_filename; struct bpf_object_open_opts *obj_opts; struct xdp_program *prog; struct bpf_object *obj; __u32 id; int fd; if (!opts || !OPTS_VALID(opts, xdp_program_opts)) goto err; obj = OPTS_GET(opts, obj, NULL); obj_opts = OPTS_GET(opts, opts, NULL); prog_name = OPTS_GET(opts, prog_name, NULL); find_filename = OPTS_GET(opts, find_filename, NULL); open_filename = OPTS_GET(opts, open_filename, NULL); pin_path = OPTS_GET(opts, pin_path, NULL); id = OPTS_GET(opts, id, 0); fd = OPTS_GET(opts, fd, 0); if (obj) { /* prog_name is optional */ if (obj_opts || find_filename || open_filename || pin_path || id || fd) goto err; prog = xdp_program__create_from_obj(obj, NULL, prog_name, true); } else if (find_filename) { /* prog_name, obj_opts 
is optional */
		if (obj || open_filename || pin_path || id || fd)
			goto err;
		prog = __xdp_program__find_file(find_filename, NULL, prog_name,
						obj_opts);
	} else if (open_filename) {
		/* prog_name, obj_opts is optional */
		if (obj || find_filename || pin_path || id || fd)
			goto err;
		prog = __xdp_program__open_file(open_filename, NULL, prog_name,
						obj_opts);
	} else if (pin_path) {
		if (obj || obj_opts || prog_name || find_filename ||
		    open_filename || id || fd)
			goto err;
		prog = xdp_program__from_pin(pin_path);
	} else if (id) {
		if (obj || obj_opts || prog_name || find_filename ||
		    open_filename || pin_path || fd)
			goto err;
		prog = xdp_program__from_id(id);
	} else if (fd) {
		if (obj || obj_opts || prog_name || find_filename ||
		    open_filename || pin_path || id)
			goto err;
		prog = xdp_program__from_fd(fd);
	} else {
		/* no source supplied at all */
		goto err;
	}

	if (IS_ERR(prog))
		return libxdp_err_ptr(PTR_ERR(prog), true);

	return prog;
err:
	return libxdp_err_ptr(-EINVAL, true);
}

/* qsort()-style comparator fixing the run order of component programs:
 * primarily by run priority, then by program name, then a series of
 * tie-breakers intended only to keep the ordering stable.
 */
static int cmp_xdp_programs(const void *_a, const void *_b)
{
	const struct xdp_program *a = *(struct xdp_program * const *)_a;
	const struct xdp_program *b = *(struct xdp_program * const *)_b;
	int cmp;

	if (a->run_prio != b->run_prio)
		return a->run_prio < b->run_prio ? -1 : 1;

	cmp = strcmp(a->prog_name, b->prog_name);
	if (cmp)
		return cmp;

	/* Hopefully the two checks above will resolve most comparisons; in
	 * cases where they don't, hopefully the checks below will keep the
	 * order stable.
	 */

	/* loaded before non-loaded */
	if (a->prog_fd >= 0 && b->prog_fd < 0)
		return -1;
	else if (a->prog_fd < 0 && b->prog_fd >= 0)
		return 1;

	/* two unloaded programs - compare by size */
	if (a->bpf_prog && b->bpf_prog) {
		size_t size_a, size_b;

		size_a = bpf_program__insn_cnt(a->bpf_prog);
		size_b = bpf_program__insn_cnt(b->bpf_prog);
		if (size_a != size_b)
			return size_a < size_b ? -1 : 1;
	}

	cmp = memcmp(a->prog_tag, b->prog_tag, BPF_TAG_SIZE);
	if (cmp)
		return cmp;

	/* at this point we are really grasping for straws */
	if (a->load_time != b->load_time)
		return a->load_time < b->load_time ? -1 : 1;

	return 0;
}

/* Pin a loaded program at pin_path in bpffs; -EINVAL if not loaded. */
int xdp_program__pin(struct xdp_program *prog, const char *pin_path)
{
	if (IS_ERR_OR_NULL(prog) || prog->prog_fd < 0)
		return libxdp_err(-EINVAL);

	return libxdp_err(bpf_program__pin(prog->bpf_prog, pin_path));
}

/* Load prog into the kernel (unless its bpf_object turns out to already be
 * loaded) and sync the xdp_program state from the resulting prog fd.
 */
static int xdp_program__load(struct xdp_program *prog)
{
	bool is_loaded, autoload;
	int err;

	if (IS_ERR_OR_NULL(prog))
		return -EINVAL;

	if (prog->prog_fd >= 0)
		return -EEXIST;

	if (!prog->bpf_obj || !prog->bpf_prog)
		return -EINVAL;

	/* bpf_program__set_autoload fails if the object is loaded, use this to
	 * detect if it is (since libbpf doesn't expose an API to discover
	 * this). This is necessary because of objects containing multiple
	 * programs: if a user creates xdp_program references to programs in
	 * such an object before loading it, they will get out of sync.
*/ autoload = bpf_program__autoload(prog->bpf_prog); is_loaded = !!bpf_program__set_autoload(prog->bpf_prog, autoload); if (is_loaded) { pr_debug("XDP program %s is already loaded with fd %d\n", xdp_program__name(prog), bpf_program__fd(prog->bpf_prog)); prog->is_frags = !!(bpf_program__flags(prog->bpf_prog) & BPF_F_XDP_HAS_FRAGS); } else { /* We got an explicit load request, make sure we actually load */ if (!autoload) bpf_program__set_autoload(prog->bpf_prog, true); /* Make sure we sync is_frags to internal state variable (in case it was * changed on bpf_prog since creation), and unset flag if we're loading * an EXT program (the dispatcher will have the flag set instead in this * case) */ prog->is_frags = xdp_program__xdp_frags_support(prog); #ifdef HAVE_LIBBPF_BPF_PROGRAM__FLAGS if (bpf_program__type(prog->bpf_prog) == BPF_PROG_TYPE_EXT) { bpf_program__set_flags( prog->bpf_prog, bpf_program__flags(prog->bpf_prog) & ~(BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY)); } #endif err = bpf_object__load(prog->bpf_obj); if (err) return err; pr_debug("Loaded XDP program %s, got fd %d\n", xdp_program__name(prog), bpf_program__fd(prog->bpf_prog)); } /* xdp_program__fill_from_fd() clones the fd and takes ownership of the clone */ return xdp_program__fill_from_fd(prog, bpf_program__fd(prog->bpf_prog)); } struct xdp_program *xdp_program__clone(struct xdp_program *prog, unsigned int flags) { if (IS_ERR_OR_NULL(prog) || flags || (prog->prog_fd < 0 && !prog->bpf_obj)) return libxdp_err_ptr(-EINVAL, false); if (prog->prog_fd >= 0) /* Clone a loaded program struct by creating a new object from the program fd; xdp_program__fill_from_fd() already duplicates the fd before filling in the object, so this creates a completely independent xdp_program object. 
*/ return xdp_program__from_fd(prog->prog_fd); return xdp_program__create_from_obj(prog->bpf_obj, NULL, prog->prog_name, true); } #ifndef HAVE_LIBBPF_BPF_PROGRAM__FLAGS static bool kernel_has_frags_support(void) { pr_debug("Can't support frags with old version of libbpf that doesn't support setting program flags.\n"); return false; } static bool kernel_has_dev_bound(void) { pr_debug("Can't bind to device with old version of libbpf that doesn't support setting program flags.\n"); return false; } static int xdp_program__set_xdp_dev_bound(__unused struct xdp_program *prog, __unused unsigned int ifindex) { return libxdp_err(-EOPNOTSUPP); } #else static const char *get_bpf_flag_name(__u32 flag) { switch (flag) { case BPF_F_XDP_DEV_BOUND_ONLY: return "BPF_F_XDP_DEV_BOUND_ONLY"; case BPF_F_XDP_HAS_FRAGS: return "BPF_F_XDP_HAS_FRAGS"; default: return NULL; } } static bool kernel_has_bpf_flag(__u32 flag) { struct xdp_program *test_prog; bool ret = false; int err; pr_debug("Checking for kernel frags support\n"); test_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); if (IS_ERR(test_prog)) { err = PTR_ERR(test_prog); pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); return false; } bpf_program__set_flags(test_prog->bpf_prog, flag); err = xdp_program__load(test_prog); if (!err) { pr_debug("Kernel supports XDP programs with flag: %s\n", get_bpf_flag_name(flag)); ret = true; } else { pr_debug("Kernel DOES NOT support XDP programs with flag: %s\n", get_bpf_flag_name(flag)); } xdp_program__close(test_prog); return ret; } static bool kernel_has_frags_support(void) { return kernel_has_bpf_flag(BPF_F_XDP_HAS_FRAGS); } static bool kernel_has_dev_bound(void) { return kernel_has_bpf_flag(BPF_F_XDP_DEV_BOUND_ONLY); } static int xdp_program__set_xdp_dev_bound(struct xdp_program *prog, unsigned int ifindex) { __u32 prog_flags; int ret; if (IS_ERR_OR_NULL(prog) || !prog->bpf_prog || prog->prog_fd >= 0) return libxdp_err(-EINVAL); if (!kernel_has_dev_bound()) { 
pr_warn("Current kernel version does not support XDP device binding."); return libxdp_err(-ENOTSUP); } prog_flags = bpf_program__flags(prog->bpf_prog); if (ifindex > 0) prog_flags |= BPF_F_XDP_DEV_BOUND_ONLY; else prog_flags &= ~BPF_F_XDP_DEV_BOUND_ONLY; ret = bpf_program__set_flags(prog->bpf_prog, prog_flags); bpf_program__set_ifindex(prog->bpf_prog, ifindex); if (!ret) return libxdp_err(ret); return 0; } #endif // HAVE_LIBBPF_BPF_PROGRAM__FLAGS static int xdp_program__attach_single(struct xdp_program *prog, int ifindex, enum xdp_attach_mode mode, unsigned int flags) { int err; if (prog->prog_fd < 0) { if (!kernel_has_frags_support()) xdp_program__set_xdp_frags_support(prog, false); bpf_program__set_type(prog->bpf_prog, BPF_PROG_TYPE_XDP); if (flags & XDP_ATTACH_DEVBIND) { err = xdp_program__set_xdp_dev_bound(prog, ifindex); if (err) return err; } err = xdp_program__load(prog); if (err) return err; } if (prog->prog_fd < 0) return -EINVAL; return xdp_attach_fd(xdp_program__fd(prog), -1, ifindex, mode); } static int xdp_multiprog__main_fd(struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return -EINVAL; if (!mp->main_prog) return -ENOENT; return mp->main_prog->prog_fd; } static __u32 xdp_multiprog__main_id(struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp) || !mp->main_prog) return 0; return mp->main_prog->prog_id; } static int xdp_multiprog__hw_fd(struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return -EINVAL; if (!mp->hw_prog) return -ENOENT; return mp->hw_prog->prog_fd; } static __u32 xdp_multiprog__hw_id(struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp) || !mp->hw_prog) return 0; return mp->hw_prog->prog_id; } static int xdp_program__attach_hw(struct xdp_program *prog, int ifindex) { struct bpf_map *map; bpf_program__set_ifindex(prog->bpf_prog, ifindex); bpf_object__for_each_map (map, prog->bpf_obj) { bpf_map__set_ifindex(map, ifindex); } return xdp_program__attach_single(prog, ifindex, XDP_MODE_HW, 0); } static int xdp_multiprog__detach_hw(struct 
xdp_multiprog *old_mp) { int err = 0, hw_fd = -1, ifindex = -1; if (!old_mp) return -EINVAL; ifindex = old_mp->ifindex; hw_fd = xdp_multiprog__hw_fd(old_mp); if (hw_fd < 0) return -EINVAL; err = xdp_attach_fd(-1, hw_fd, ifindex, XDP_MODE_HW); if (err < 0) return err; pr_debug("Detached hw program on ifindex '%d'\n", ifindex); return 0; } int xdp_program__attach_multi(struct xdp_program **progs, size_t num_progs, int ifindex, enum xdp_attach_mode mode, unsigned int flags) { struct xdp_multiprog *old_mp = NULL, *mp; int err = 0, retry_counter = 0; if (!progs || !num_progs || flags & ~XDP_ATTACH_FLAGS) return libxdp_err(-EINVAL); retry: old_mp = xdp_multiprog__get_from_ifindex(ifindex); if (IS_ERR_OR_NULL(old_mp)) old_mp = NULL; if (mode == XDP_MODE_HW) { bool old_hw_prog = xdp_multiprog__hw_prog(old_mp) != NULL; xdp_multiprog__close(old_mp); if (old_hw_prog) { pr_warn("XDP program already loaded in HW mode on ifindex %d; " "replacing HW mode programs not supported\n", ifindex); return libxdp_err(-EEXIST); } if (num_progs > 1) return libxdp_err(-EINVAL); return libxdp_err(xdp_program__attach_hw(progs[0], ifindex)); } if (num_progs == 1) { char *envval; envval = secure_getenv(XDP_SKIP_ENVVAR); if (envval && envval[0] == '1' && envval[1] == '\0') { pr_debug("Skipping dispatcher due to environment setting\n"); return libxdp_err(xdp_program__attach_single(progs[0], ifindex, mode, flags)); } } mp = xdp_multiprog__generate(progs, num_progs, ifindex, old_mp, false, flags); if (IS_ERR(mp)) { err = PTR_ERR(mp); mp = NULL; if (err == -EOPNOTSUPP) { if (num_progs == 1) { pr_info("Falling back to loading single prog " "without dispatcher\n"); return libxdp_err(xdp_program__attach_single( progs[0], ifindex, mode, flags)); } else { pr_warn("Can't fall back to legacy load with %zu " "programs\n%s\n", num_progs, dispatcher_feature_err); } } goto out; } err = xdp_multiprog__pin(mp); if (err) { pr_warn("Failed to pin program: %s\n", strerror(-err)); goto out_close; } err = 
xdp_multiprog__attach(old_mp, mp, mode); if (err) { pr_debug("Failed to attach dispatcher on ifindex %d: %s\n", ifindex, strerror(-err)); xdp_multiprog__unpin(mp); if (err == -EAGAIN) { if (++retry_counter > MAX_RETRY) { pr_warn("Retried more than %d times, giving up\n", retry_counter); err = -EBUSY; goto out_close; } pr_debug("Existing dispatcher replaced while building replacement, retrying.\n"); xdp_multiprog__close(old_mp); xdp_multiprog__close(mp); usleep(1 << retry_counter); /* exponential backoff */ goto retry; } goto out_close; } if (old_mp) { err = xdp_multiprog__unpin(old_mp); if (err) { pr_warn("Failed to unpin old dispatcher: %s\n", strerror(-err)); err = 0; } } out_close: xdp_multiprog__close(mp); out: if (old_mp) xdp_multiprog__close(old_mp); return libxdp_err(err); } int xdp_program__attach(struct xdp_program *prog, int ifindex, enum xdp_attach_mode mode, unsigned int flags) { if (IS_ERR_OR_NULL(prog) || IS_ERR(prog)) return libxdp_err(-EINVAL); return libxdp_err(xdp_program__attach_multi(&prog, 1, ifindex, mode, flags)); } int xdp_program__detach_multi(struct xdp_program **progs, size_t num_progs, int ifindex, enum xdp_attach_mode mode, unsigned int flags) { struct xdp_multiprog *new_mp, *mp; int err = 0, retry_counter = 0; size_t i; if (flags || !num_progs || !progs) return libxdp_err(-EINVAL); retry: new_mp = NULL; mp = xdp_multiprog__get_from_ifindex(ifindex); if (IS_ERR_OR_NULL(mp)) { pr_warn("No XDP dispatcher found on ifindex %d\n", ifindex); return libxdp_err(-ENOENT); } if (mode == XDP_MODE_HW || xdp_multiprog__is_legacy(mp)) { __u32 id = (mode == XDP_MODE_HW) ? 
xdp_multiprog__hw_id(mp) : xdp_multiprog__main_id(mp); if (num_progs > 1) { pr_warn("Can only detach one program in legacy or HW mode\n"); err = -EINVAL; goto out; } if (!xdp_program__id(progs[0])) { pr_warn("Program 0 not loaded\n"); err = -EINVAL; goto out; } if (id != xdp_program__id(progs[0])) { pr_warn("Asked to unload prog %u but %u is loaded\n", xdp_program__id(progs[0]), id); err = -ENOENT; goto out; } } if (mode == XDP_MODE_HW) { err = xdp_multiprog__detach_hw(mp); goto out; } if (mode != XDP_MODE_UNSPEC && mp->attach_mode != mode) { pr_warn("XDP dispatcher attached in mode %d, requested %d\n", mp->attach_mode, mode); err = -ENOENT; goto out; } if (xdp_multiprog__is_legacy(mp)) { err = xdp_multiprog__attach(mp, NULL, mode); goto out; } /* fist pass - check progs and count number still loaded */ for (i = 0; i < num_progs; i++) { struct xdp_program *p = NULL; bool found = false; if (!progs[i]->prog_id) { pr_warn("Program %zu not loaded\n", i); err = -EINVAL; goto out; } while ((p = xdp_multiprog__next_prog(p, mp))) { if (progs[i]->prog_id == p->prog_id) found = true; } if (!found) { pr_warn("Couldn't find program with id %d on ifindex %d\n", progs[i]->prog_id, ifindex); err = -ENOENT; goto out; } } if (num_progs == mp->num_links) { err = xdp_multiprog__attach(mp, NULL, mp->attach_mode); if (err) goto out; err = xdp_multiprog__unpin(mp); if (err) goto out; } else { new_mp = xdp_multiprog__generate(progs, num_progs, ifindex, mp, true, flags); if (IS_ERR(new_mp)) { err = PTR_ERR(new_mp); if (err == -EOPNOTSUPP) { pr_warn("Asked to detach %zu progs, but %zu loaded on ifindex %d, " "and partial detach is not supported by the kernel.\n", num_progs, mp->num_links, ifindex); } goto out; } err = xdp_multiprog__pin(new_mp); if (err) { pr_warn("Failed to pin program: %s\n", strerror(-err)); goto out; } err = xdp_multiprog__attach(mp, new_mp, mode); if (err) { pr_debug("Failed to attach dispatcher on ifindex %d: %s\n", ifindex, strerror(-err)); 
xdp_multiprog__unpin(new_mp); goto out; } err = xdp_multiprog__unpin(mp); if (err) { pr_warn("Failed to unpin old dispatcher: %s\n", strerror(-err)); err = 0; } } out: xdp_multiprog__close(mp); xdp_multiprog__close(new_mp); if (err == -EAGAIN) { if (++retry_counter > MAX_RETRY) { pr_warn("Retried more than %d times, giving up\n", retry_counter); return libxdp_err(-EBUSY); } pr_debug("Existing dispatcher replaced while building replacement, retrying.\n"); usleep(1 << retry_counter); /* exponential backoff */ goto retry; } return libxdp_err(err); } int xdp_program__detach(struct xdp_program *prog, int ifindex, enum xdp_attach_mode mode, unsigned int flags) { if (IS_ERR_OR_NULL(prog) || IS_ERR(prog)) return -EINVAL; return libxdp_err(xdp_program__detach_multi(&prog, 1, ifindex, mode, flags)); } int xdp_program__test_run(struct xdp_program *prog, struct bpf_test_run_opts *opts, unsigned int flags) { struct xdp_multiprog *mp = NULL; int err, prog_fd; if (IS_ERR_OR_NULL(prog) || flags) return libxdp_err(-EINVAL); if (prog->prog_fd < 0) { err = xdp_program__load(prog); if (err) return libxdp_err(err); } if (prog->prog_type == BPF_PROG_TYPE_EXT) { mp = xdp_multiprog__generate(&prog, 1, 0, NULL, false, flags); if (IS_ERR(mp)) { err = PTR_ERR(mp); if (err == -EOPNOTSUPP) pr_warn("Program was already attached to a dispatcher, " "and kernel doesn't support multiple attachments\n"); return libxdp_err(err); } prog_fd = xdp_multiprog__main_fd(mp); } else if (prog->prog_type != BPF_PROG_TYPE_XDP) { pr_warn("Can't test_run non-XDP programs\n"); return libxdp_err(-ENOEXEC); } else { prog_fd = prog->prog_fd; } err = bpf_prog_test_run_opts(prog_fd, opts); if (err) err = -errno; if (mp) xdp_multiprog__close(mp); return libxdp_err(err); } void xdp_multiprog__close(struct xdp_multiprog *mp) { struct xdp_program *p, *next = NULL; if (IS_ERR_OR_NULL(mp)) return; xdp_program__close(mp->main_prog); for (p = mp->first_prog; p; p = next) { next = p->next; xdp_program__close(p); } 
xdp_program__close(mp->hw_prog); free(mp); } static struct xdp_multiprog *xdp_multiprog__new(int ifindex) { struct xdp_multiprog *mp; mp = malloc(sizeof *mp); if (!mp) return ERR_PTR(-ENOMEM); memset(mp, 0, sizeof(*mp)); mp->ifindex = ifindex; mp->version = XDP_DISPATCHER_VERSION; return mp; } static int xdp_multiprog__load(struct xdp_multiprog *mp) { char buf[100]; int err = 0; if (IS_ERR_OR_NULL(mp) || !mp->main_prog || mp->is_loaded || xdp_multiprog__is_legacy(mp)) return -EINVAL; pr_debug("Loading multiprog dispatcher. Progs: %d frags: %s device-bound: %s/%d\n", mp->config.num_progs_enabled, mp->config.is_xdp_frags ? "yes" : "no", mp->config.is_xdp_devbound ? "yes" : "no", mp->config.is_xdp_devbound ? 0 : mp->ifindex); if (mp->config.is_xdp_frags) xdp_program__set_xdp_frags_support(mp->main_prog, true); if (mp->config.is_xdp_devbound) xdp_program__set_xdp_dev_bound(mp->main_prog, mp->ifindex); err = xdp_program__load(mp->main_prog); if (err) { pr_info("Failed to load dispatcher: %s\n", libxdp_strerror_r(err, buf, sizeof(buf))); err = -EOPNOTSUPP; goto out; } mp->is_loaded = true; out: return err; } int check_xdp_prog_version(const struct btf *btf, const char *name, __u32 *version) { const struct btf_type *sec, *def; sec = btf_get_datasec(btf, XDP_METADATA_SECTION); if (!sec) return libxdp_err(-ENOENT); def = btf_get_section_var(btf, sec, name, BTF_KIND_PTR); if (IS_ERR(def)) return libxdp_err(PTR_ERR(def)); if (!get_field_int(btf, name, def, version)) return libxdp_err(-ENOENT); return 0; } static int check_dispatcher_version(struct xdp_multiprog *mp, const char *prog_name, const struct btf *btf, __u32 nr_maps, __u32 map_id) { __u32 version = 0, map_key = 0, info_len = sizeof(struct bpf_map_info); const char *name = "dispatcher_version"; struct bpf_map_info map_info = {}; int err, map_fd, i; __u8 *buf = NULL; if (prog_name && strcmp(prog_name, "xdp_dispatcher")) { pr_debug("XDP program with name '%s' is not a dispatcher\n", prog_name); return -ENOENT; } if 
(nr_maps != 1) { pr_warn("Expected a single map for dispatcher, found %u\n", nr_maps); return -ENOENT; } map_fd = bpf_map_get_fd_by_id(map_id); if (map_fd < 0) { err = -errno; pr_warn("Could not get config map fd for id %u: %s\n", map_id, strerror(-err)); return err; } err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); if (err) { err = -errno; pr_warn("Couldn't get map info: %s\n", strerror(-err)); goto out; } if (map_info.key_size != sizeof(map_key) || map_info.value_size < 2 || map_info.max_entries != 1 || !(map_info.map_flags & BPF_F_RDONLY_PROG)) { pr_warn("Map flags or key/value size mismatch\n"); err = -EINVAL; goto out; } buf = malloc(map_info.value_size); if (!buf) { err = -ENOMEM; goto out; } err = bpf_map_lookup_elem(map_fd, &map_key, buf); if (err) { err = -errno; pr_warn("Could not lookup map value: %s\n", strerror(-err)); goto out; } if (buf[0] == XDP_DISPATCHER_MAGIC) { version = buf[1]; } else { err = check_xdp_prog_version(btf, name, &version); if (err) goto out; } switch (version) { case XDP_DISPATCHER_VERSION_V1: { struct xdp_dispatcher_config_v1 *config = (void *)buf; for (i = 0; i < MAX_DISPATCHER_ACTIONS; i++) { mp->config.chain_call_actions[i] = config->chain_call_actions[i]; mp->config.run_prios[i] = config->run_prios[i]; } mp->config.num_progs_enabled = config->num_progs_enabled; break; } case XDP_DISPATCHER_VERSION_V2: { struct xdp_dispatcher_config_v2 *config = (void *)buf; for (i = 0; i < MAX_DISPATCHER_ACTIONS; i++) { mp->config.chain_call_actions[i] = config->chain_call_actions[i]; mp->config.run_prios[i] = config->run_prios[i]; mp->config.program_flags[i] = config->program_flags[i]; } mp->config.num_progs_enabled = config->num_progs_enabled; mp->config.is_xdp_frags = config->is_xdp_frags; mp->config.dispatcher_version = config->dispatcher_version; mp->config.magic = config->magic; break; } case XDP_DISPATCHER_VERSION: if (map_info.value_size != sizeof(mp->config)) { pr_warn("Dispatcher version matches, but map size %u != 
expected %zu\n", map_info.value_size, sizeof(mp->config)); err = -EINVAL; goto out; } memcpy(&mp->config, buf, sizeof(mp->config)); break; default: pr_warn("XDP dispatcher version %u higher than supported %u\n", version, XDP_DISPATCHER_VERSION); err = -EOPNOTSUPP; goto out; } pr_debug("Verified XDP dispatcher version %d <= %d\n", version, XDP_DISPATCHER_VERSION); mp->version = version; out: close(map_fd); free(buf); return err; } static int xdp_multiprog__link_pinned_progs(struct xdp_multiprog *mp) { char buf[PATH_MAX], pin_path[PATH_MAX]; struct xdp_program *prog, *p = NULL; const char *bpffs_dir; int err, lock_fd, i; struct stat sb = {}; if (IS_ERR_OR_NULL(mp) || mp->first_prog) return -EINVAL; bpffs_dir = get_bpffs_dir(); if (IS_ERR(bpffs_dir)) return PTR_ERR(bpffs_dir); err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", bpffs_dir, mp->ifindex, mp->main_prog->prog_id); if (err) return err; lock_fd = xdp_lock_acquire(); if (lock_fd < 0) return lock_fd; pr_debug("Reading multiprog component programs from pinned directory\n"); err = stat(pin_path, &sb); if (err) { err = -errno; pr_debug("Couldn't stat pin_path '%s': %s\n", pin_path, strerror(-err)); goto out; } for (i = 0; i < mp->config.num_progs_enabled; i++) { err = try_snprintf(buf, sizeof(buf), "%s/prog%d-prog", pin_path, i); if (err) goto err; prog = xdp_program__from_pin(buf); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto err; } err = try_snprintf(buf, sizeof(buf), "prog%d", i); if (err) goto err; prog->attach_name = strdup(buf); if (!prog->attach_name) { err = -ENOMEM; goto err; } prog->chain_call_actions = (mp->config.chain_call_actions[i] & ~(1U << XDP_DISPATCHER_RETVAL)); prog->run_prio = mp->config.run_prios[i]; prog->is_frags = !!(mp->config.program_flags[i] & BPF_F_XDP_HAS_FRAGS); if (!p) { mp->first_prog = prog; p = mp->first_prog; } else { p->next = prog; p = prog; } mp->num_links++; } out: xdp_lock_release(lock_fd); return err; err: prog = mp->first_prog; while (prog) { p = 
prog->next; xdp_program__close(prog); prog = p; } mp->first_prog = NULL; goto out; } static int xdp_multiprog__fill_from_fd(struct xdp_multiprog *mp, int prog_fd, int hw_fd) { struct bpf_prog_info info = {}; __u32 info_len, map_id = 0; struct xdp_program *prog; struct btf *btf = NULL; int err = 0; if (IS_ERR_OR_NULL(mp)) return -EINVAL; if (prog_fd > 0) { info.nr_map_ids = 1; info.map_ids = (uintptr_t)&map_id; info_len = sizeof(info); err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); if (err) { pr_warn("couldn't get program info for fd: %d", prog_fd); return -EINVAL; } if (!info.btf_id) { pr_debug("No BTF for prog ID %u\n", info.id); mp->is_legacy = true; goto legacy; } btf = btf__load_from_kernel_by_id(info.btf_id); if (!btf) { pr_warn("Couldn't get BTF for ID %ul\n", info.btf_id); goto out; } err = check_dispatcher_version(mp, info.name, btf, info.nr_map_ids, map_id); if (err) { if (err != -ENOENT) { pr_warn("Dispatcher version check failed for ID %d\n", info.id); goto out; } else { /* no dispatcher, mark as legacy prog */ mp->is_legacy = true; err = 0; goto legacy; } } legacy: prog = xdp_program__from_fd(prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } mp->main_prog = prog; if (!xdp_multiprog__is_legacy(mp)) { err = xdp_multiprog__link_pinned_progs(mp); if (err) { pr_warn("Unable to read pinned progs: %s\n", strerror(-err)); mp->is_legacy = true; err = 0; } } pr_debug("Found %s with id %d and %zu component progs\n", xdp_multiprog__is_legacy(mp) ? 
"legacy program" : "multiprog", mp->main_prog->prog_id, mp->num_links); } if (hw_fd > 0) { prog = xdp_program__from_fd(hw_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out; } if (mp->first_prog == NULL) mp->is_legacy = true; mp->hw_prog = prog; pr_debug("Found hw program with id %d\n", mp->hw_prog->prog_id); } mp->is_loaded = true; out: btf__free(btf); return err; } static struct xdp_multiprog *xdp_multiprog__from_fd(int fd, int hw_fd, int ifindex) { struct xdp_multiprog *mp = NULL; int err; mp = xdp_multiprog__new(ifindex); if (IS_ERR(mp)) return mp; err = xdp_multiprog__fill_from_fd(mp, fd, hw_fd); if (err) goto err; return mp; err: xdp_multiprog__close(mp); return ERR_PTR(err); } static struct xdp_multiprog *xdp_multiprog__from_id(__u32 id, __u32 hw_id, int ifindex) { struct xdp_multiprog *mp; int hw_fd = 0; int fd = 0; int err; if (id) { fd = bpf_prog_get_fd_by_id(id); if (fd < 0) { err = -errno; pr_warn("couldn't get program fd: %s", strerror(-err)); goto err; } } if (hw_id) { hw_fd = bpf_prog_get_fd_by_id(hw_id); if (hw_fd < 0) { err = -errno; pr_warn("couldn't get program fd: %s", strerror(-err)); goto err; } } mp = xdp_multiprog__from_fd(fd, hw_fd, ifindex); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto err; } // duplicated fd/hw_fd already in prog, close originals if (fd > 0) close(fd); if (hw_fd > 0) close(hw_fd); return mp; err: if (fd > 0) close(fd); if (hw_fd > 0) close(hw_fd); return ERR_PTR(err); } static int xdp_get_ifindex_prog_id(int ifindex, __u32 *prog_id, __u32 *hw_prog_id, enum xdp_attach_mode *mode) { __u32 _prog_id, _drv_prog_id, _hw_prog_id, _skb_prog_id; enum xdp_attach_mode _mode; __u8 _attach_mode; if (!hw_prog_id) hw_prog_id = &_prog_id; if (!mode) mode = &_mode; int err; #ifdef HAVE_LIBBPF_BPF_XDP_ATTACH LIBBPF_OPTS(bpf_xdp_query_opts, opts); err = bpf_xdp_query(ifindex, 0, &opts); if (err) return err; _drv_prog_id = opts.drv_prog_id; _skb_prog_id = opts.skb_prog_id; _hw_prog_id = opts.hw_prog_id; _attach_mode = opts.attach_mode; 
#else struct xdp_link_info xinfo = {}; err = bpf_get_link_xdp_info(ifindex, &xinfo, sizeof(xinfo), 0); if (err) return err; _drv_prog_id = xinfo.drv_prog_id; _skb_prog_id = xinfo.skb_prog_id; _hw_prog_id = xinfo.hw_prog_id; _attach_mode = xinfo.attach_mode; #endif switch (_attach_mode) { case XDP_ATTACHED_SKB: *prog_id = _skb_prog_id; *mode = XDP_MODE_SKB; break; case XDP_ATTACHED_DRV: *prog_id = _drv_prog_id; *mode = XDP_MODE_NATIVE; break; case XDP_ATTACHED_MULTI: if (_drv_prog_id) { *prog_id = _drv_prog_id; *mode = XDP_MODE_NATIVE; } else if (_skb_prog_id) { *prog_id = _skb_prog_id; *mode = XDP_MODE_SKB; } *hw_prog_id = _hw_prog_id; break; case XDP_ATTACHED_HW: *hw_prog_id = _hw_prog_id; *mode = XDP_MODE_UNSPEC; break; case XDP_ATTACHED_NONE: default: *mode = XDP_MODE_UNSPEC; break; } return 0; } struct xdp_multiprog *xdp_multiprog__get_from_ifindex(int ifindex) { enum xdp_attach_mode mode = XDP_MODE_UNSPEC; int err, retry_counter = 0; struct xdp_multiprog *mp; __u32 hw_prog_id = 0; __u32 prog_id = 0; retry: err = xdp_get_ifindex_prog_id(ifindex, &prog_id, &hw_prog_id, &mode); if (err) return libxdp_err_ptr(err, false); if (!prog_id && !hw_prog_id) return libxdp_err_ptr(-ENOENT, false); mp = xdp_multiprog__from_id(prog_id, hw_prog_id, ifindex); if (!IS_ERR_OR_NULL(mp)) mp->attach_mode = mode; else if (IS_ERR(mp)) { err = PTR_ERR(mp); if (err == -ENOENT) { if (++retry_counter > MAX_RETRY) { pr_warn("Retried more than %d times, giving up\n", retry_counter); err = -EBUSY; } else { pr_debug("Dispatcher disappeared before we could load it, retrying.\n"); usleep(1 << retry_counter); /* exponential backoff */ goto retry; } } mp = libxdp_err_ptr(err, false); } else mp = libxdp_err_ptr(0, true); return mp; } int libxdp_check_kern_compat(void) { struct xdp_program *tgt_prog = NULL, *test_prog = NULL; const char *bpffs_dir; char buf[PATH_MAX]; int lock_fd; int err = 0; bpffs_dir = get_bpffs_dir(); if (IS_ERR(bpffs_dir)) { err = PTR_ERR(bpffs_dir); pr_warn("Can't use 
dispatcher without a working bpffs\n"); return -EOPNOTSUPP; } if (kernel_compat > COMPAT_UNKNOWN) goto skip; pr_debug("Checking dispatcher compatibility\n"); tgt_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); return err; } test_prog = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_pass", NULL); if (IS_ERR(test_prog)) { err = PTR_ERR(test_prog); pr_warn("Couldn't open BPF file xdp-dispatcher.o\n"); return err; } err = xdp_program__load(tgt_prog); if (err) { pr_debug("Couldn't load XDP program: %s\n", strerror(-err)); goto out; } err = bpf_program__set_attach_target(test_prog->bpf_prog, tgt_prog->prog_fd, "xdp_pass"); if (err) { pr_debug("Failed to set attach target: %s\n", strerror(-err)); goto out; } bpf_program__set_type(test_prog->bpf_prog, BPF_PROG_TYPE_EXT); bpf_program__set_expected_attach_type(test_prog->bpf_prog, 0); err = xdp_program__load(test_prog); if (err) { char buf[100] = {}; libxdp_strerror(err, buf, sizeof(buf)); pr_debug("Failed to load program %s: %s\n", xdp_program__name(test_prog), buf); goto out; } test_prog->link_fd = bpf_raw_tracepoint_open(NULL, test_prog->prog_fd); if (test_prog->link_fd < 0) { err = -errno; pr_debug("Failed to attach test program to dispatcher: %s\n", strerror(-err)); goto out; } err = try_snprintf(buf, sizeof(buf), "%s/prog-test-link-%i-%i", bpffs_dir, IFINDEX_LO, test_prog->prog_id); if (err) goto out; lock_fd = xdp_lock_acquire(); if (lock_fd < 0) { err = lock_fd; goto out; } err = bpf_obj_pin(test_prog->link_fd, buf); if (err) { err = -errno; pr_warn("Couldn't pin link FD at %s: %s\n", buf, strerror(-err)); goto out_locked; } err = unlink(buf); if (err) { err = -errno; pr_warn("Couldn't unlink file %s: %s\n", buf, strerror(-err)); goto out_locked; } kernel_compat = COMPAT_SUPPORTED; out_locked: xdp_lock_release(lock_fd); out: xdp_program__close(test_prog); 
xdp_program__close(tgt_prog); if (err) { pr_info("Compatibility check for dispatcher program failed: %s\n", strerror(-err)); kernel_compat = COMPAT_UNSUPPORTED; } skip: return kernel_compat == COMPAT_SUPPORTED ? 0 : -EOPNOTSUPP; } static int find_prog_btf_id(const char *name, __u32 attach_prog_fd) { struct bpf_prog_info info = {}; __u32 info_size = sizeof(info); int err = -EINVAL; struct btf *btf; err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_size); if (err) { err = -errno; pr_warn("failed get_prog_info for FD %d\n", attach_prog_fd); return err; } if (!info.btf_id) { pr_warn("The target program doesn't have BTF\n"); return -EINVAL; } btf = btf__load_from_kernel_by_id(info.btf_id); if (!btf) { pr_warn("Failed to get BTF of the program\n"); return -EINVAL; } err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); btf__free(btf); if (err <= 0) pr_warn("%s is not found in prog's BTF\n", name); return err; } static int xdp_multiprog__link_prog(struct xdp_multiprog *mp, struct xdp_program *prog) { DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); struct xdp_program *new_prog, *p; bool was_loaded = false; char buf[PATH_MAX]; int err, lfd = -1; char *attach_func; __s32 btf_id; if (IS_ERR_OR_NULL(mp) || IS_ERR_OR_NULL(prog) || !mp->is_loaded || mp->num_links >= mp->config.num_progs_enabled) return -EINVAL; err = libxdp_check_kern_compat(); if (err) return err; if (!prog->btf) { pr_warn("Program %s has no BTF information, so we can't load it as multiprog\n", xdp_program__name(prog)); return -EOPNOTSUPP; } pr_debug("Linking prog %s as multiprog entry %zu\n", xdp_program__name(prog), mp->num_links); err = try_snprintf(buf, sizeof(buf), "prog%zu", mp->num_links); if (err) goto err; if (mp->config.num_progs_enabled == 1) attach_func = "xdp_dispatcher"; else attach_func = buf; btf_id = find_prog_btf_id(attach_func, mp->main_prog->prog_fd); if (btf_id <= 0) { err = btf_id; pr_debug("Couldn't find BTF ID for %s: %d\n", attach_func, err); goto err; } if (prog->prog_fd < 
0) { err = bpf_program__set_attach_target(prog->bpf_prog, mp->main_prog->prog_fd, attach_func); if (err) { pr_debug("Failed to set attach target: %s\n", strerror(-err)); goto err; } bpf_program__set_type(prog->bpf_prog, BPF_PROG_TYPE_EXT); bpf_program__set_expected_attach_type(prog->bpf_prog, 0); err = xdp_program__load(prog); if (err) { if (err == -E2BIG) { pr_debug("Got 'argument list too long' error while " "loading component program.\n"); err = -EOPNOTSUPP; } else { char buf[100] = {}; libxdp_strerror(err, buf, sizeof(buf)); pr_debug("Failed to load program %s: %s\n", xdp_program__name(prog), buf); } goto err; } was_loaded = true; } /* clone the xdp_program ref so we can keep it */ new_prog = xdp_program__clone(prog, 0); if (IS_ERR(new_prog)) { err = PTR_ERR(new_prog); pr_warn("Failed to clone xdp_program: %s\n", strerror(-err)); goto err; } opts.target_btf_id = btf_id; /* The attach will disappear once this fd is closed */ lfd = bpf_link_create(new_prog->prog_fd, mp->main_prog->prog_fd, 0, &opts); if (lfd < 0) { err = -errno; if (err == -EINVAL) { if (!was_loaded) { pr_debug("Kernel doesn't support re-attaching " "freplace programs.\n"); err = -EOPNOTSUPP; } else { pr_debug("Got EINVAL, retrying " "raw_tracepoint_open() without target\n"); /* we just loaded the program, so should be able * to attach the old way */ lfd = bpf_raw_tracepoint_open(NULL, new_prog->prog_fd); if (lfd < 0) err = -errno; else goto attach_ok; } } if (err == -EPERM) { pr_debug("Got 'permission denied' error while " "attaching program to dispatcher.\n%s\n", dispatcher_feature_err); err = -EOPNOTSUPP; } else { pr_warn("Failed to attach program %s to dispatcher: %s\n", xdp_program__name(new_prog), strerror(-err)); } goto err_free; } attach_ok: new_prog->attach_name = strdup(buf); if (!new_prog->attach_name) { err = -ENOMEM; goto err_free; } pr_debug( "Attached prog '%s' with priority %d in dispatcher entry '%s' with fd %d\n", xdp_program__name(new_prog), xdp_program__run_prio(new_prog), 
new_prog->attach_name, lfd); new_prog->link_fd = lfd; if (!mp->first_prog) { mp->first_prog = new_prog; } else { p = mp->first_prog; while (p->next) p = p->next; p->next = new_prog; } mp->num_links++; return 0; err_free: if (lfd >= 0) close(lfd); xdp_program__close(new_prog); err: return err; } /* * xdp_multiprog__generate - generate a new multiprog dispatcher * * This generates a new multiprog dispatcher for the programs in progs. If * old_mp is set, the progs will either be added to or removed from the existing * set of programs in the dispatcher represented by old_mp, depending on the * value of remove_progs. If old_mp is not set, a new dispatcher will be created * just holding the programs in progs. In both cases, the full set of programs * will be sorted according to their run order (see cmp_xdp_programs). * * When called with remove_progs set, the caller is responsible for checking * that all the programs in progs are actually present in old_mp. */ static struct xdp_multiprog *xdp_multiprog__generate(struct xdp_program **progs, size_t num_progs, int ifindex, struct xdp_multiprog *old_mp, bool remove_progs, unsigned int flags) { size_t num_new_progs = old_mp ? old_mp->num_links : 0; struct xdp_program **new_progs = NULL; struct xdp_program *dispatcher; struct xdp_multiprog *mp; struct bpf_map *map; size_t i; int err; if (!progs || !num_progs || (!old_mp && remove_progs)) return ERR_PTR(-EINVAL); num_new_progs += remove_progs ? -num_progs : num_progs; if (num_new_progs > MAX_DISPATCHER_ACTIONS) { pr_warn("Not enough free slots in the dispatcher.\n"); return ERR_PTR(-E2BIG); } if (!remove_progs && old_mp) { if (old_mp->config.is_xdp_devbound) { if (!(flags & XDP_ATTACH_DEVBIND)) { pr_warn("Dispatcher is already bound to ifindex %d. You did not specify XDP_ATTACH_DEVBIND in the attach flags of the new program\n", old_mp->ifindex); return ERR_PTR(-EINVAL); } } else if (flags & XDP_ATTACH_DEVBIND) { pr_warn("Dispatcher was not bound to a device. 
Cannot rebind it, some old programs may require access to multiple interfaces\n"); return ERR_PTR(-EINVAL); } } pr_debug("Generating multi-prog dispatcher for %zu programs\n", num_new_progs); mp = xdp_multiprog__new(ifindex); if (IS_ERR(mp)) return mp; mp->kernel_frags_support = kernel_has_frags_support(); mp->kernel_devbound_support = kernel_has_dev_bound(); if (old_mp) { struct xdp_program *prog; size_t j; if (xdp_multiprog__is_legacy(old_mp)) { pr_warn("Existing program is not using a dispatcher, can't replace; unload first\n"); err = -EBUSY; goto err; } if (old_mp->version < mp->version) { pr_warn("Existing dispatcher version %u is older than our version %u. " "Refusing transparent upgrade, unload first\n", old_mp->version, mp->version); err = -EBUSY; goto err; } new_progs = calloc(num_new_progs, sizeof(*new_progs)); if (!new_progs) { err = -ENOMEM; goto err; } for (i = 0, prog = old_mp->first_prog; prog; prog = prog->next) { if (remove_progs) { /* remove_new means new_progs is an array of * programs we should remove from old_mp instead * of adding them. */ bool found = false; for (j = 0; j < num_progs; j++) if (progs[j]->prog_id == prog->prog_id) found = true; if (found) continue; /* Sanity check: caller should ensure all * programs to remove actually exist; check here * anyway to ensure we don't overrun the array * if this is not done correctly. 
*/ if (i >= num_new_progs) { pr_warn("Not all programs to remove were found\n"); err = -EINVAL; goto err; } } new_progs[i++] = prog; } if (!remove_progs) for (j = 0; i < num_new_progs; i++, j++) new_progs[i] = progs[j]; } else { new_progs = progs; } if (num_new_progs > 1) qsort(new_progs, num_new_progs, sizeof(*new_progs), cmp_xdp_programs); dispatcher = __xdp_program__find_file("xdp-dispatcher.o", NULL, "xdp_dispatcher", NULL); if (IS_ERR(dispatcher)) { err = PTR_ERR(dispatcher); pr_warn("Couldn't open BPF file 'xdp-dispatcher.o'\n"); goto err; } mp->main_prog = dispatcher; map = bpf_object__next_map(mp->main_prog->bpf_obj, NULL); if (!map) { pr_warn("Couldn't find rodata map in object file 'xdp-dispatcher.o'\n"); err = -ENOENT; goto err; } mp->config.magic = XDP_DISPATCHER_MAGIC; mp->config.dispatcher_version = mp->version; mp->config.num_progs_enabled = num_new_progs; mp->config.is_xdp_frags = mp->kernel_frags_support; mp->config.is_xdp_devbound = !!old_mp ? old_mp->config.is_xdp_devbound : (flags & XDP_ATTACH_DEVBIND); for (i = 0; i < num_new_progs; i++) { mp->config.chain_call_actions[i] = (new_progs[i]->chain_call_actions | (1U << XDP_DISPATCHER_RETVAL)); mp->config.run_prios[i] = new_progs[i]->run_prio; if (xdp_program__xdp_frags_support(new_progs[i])) mp->config.program_flags[i] |= BPF_F_XDP_HAS_FRAGS; else mp->config.is_xdp_frags = false; if (mp->config.is_xdp_devbound) mp->config.program_flags[i] |= BPF_F_XDP_DEV_BOUND_ONLY; } if (mp->kernel_frags_support) { if (!mp->config.is_xdp_frags) pr_debug("At least one attached program doesn't " "support frags, disabling it for the " "dispatcher\n"); else pr_debug("All attached programs support frags, " "enabling it for the dispatcher\n"); } err = bpf_map__set_initial_value(map, &mp->config, sizeof(mp->config)); if (err) { pr_warn("Failed to set rodata for object file 'xdp-dispatcher.o'\n"); goto err; } err = xdp_multiprog__load(mp); if (err) goto err; for (i = 0; i < num_new_progs; i++) { err = 
xdp_multiprog__link_prog(mp, new_progs[i]); if (err) goto err; } if (old_mp) free(new_progs); return mp; err: if (old_mp) free(new_progs); xdp_multiprog__close(mp); return ERR_PTR(err); } static int xdp_multiprog__pin(struct xdp_multiprog *mp) { char pin_path[PATH_MAX], buf[PATH_MAX]; struct xdp_program *prog; const char *bpffs_dir; int err = 0, lock_fd; if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) return -EINVAL; bpffs_dir = get_bpffs_dir(); if (IS_ERR(bpffs_dir)) return PTR_ERR(bpffs_dir); err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", bpffs_dir, mp->ifindex, mp->main_prog->prog_id); if (err) return err; lock_fd = xdp_lock_acquire(); if (lock_fd < 0) return lock_fd; pr_debug("Pinning multiprog fd %d beneath %s\n", mp->main_prog->prog_fd, pin_path); err = mkdir(pin_path, S_IRWXU); if (err && errno != EEXIST) { err = -errno; goto out; } for (prog = mp->first_prog; prog; prog = prog->next) { if (prog->link_fd < 0) { err = -EINVAL; pr_warn("Prog %s not linked\n", prog->prog_name); goto err_unpin; } err = try_snprintf(buf, sizeof(buf), "%s/%s-link", pin_path, prog->attach_name); if (err) goto err_unpin; err = bpf_obj_pin(prog->link_fd, buf); if (err) { err = -errno; pr_warn("Couldn't pin link FD at %s: %s\n", buf, strerror(-err)); goto err_unpin; } pr_debug("Pinned link for prog %s at %s\n", prog->prog_name, buf); err = try_snprintf(buf, sizeof(buf), "%s/%s-prog", pin_path, prog->attach_name); if (err) goto err_unpin; err = bpf_obj_pin(prog->prog_fd, buf); if (err) { err = -errno; pr_warn("Couldn't pin prog FD at %s: %s\n", buf, strerror(-err)); goto err_unpin; } pr_debug("Pinned prog %s at %s\n", prog->prog_name, buf); } out: xdp_lock_release(lock_fd); return err; err_unpin: for (prog = mp->first_prog; prog; prog = prog->next) { if (!try_snprintf(buf, sizeof(buf), "%s/%s-link", pin_path, prog->attach_name)) unlink(buf); if (!try_snprintf(buf, sizeof(buf), "%s/%s-prog", pin_path, prog->attach_name)) unlink(buf); } rmdir(pin_path); goto 
out; } static int xdp_multiprog__unpin(struct xdp_multiprog *mp) { char pin_path[PATH_MAX], buf[PATH_MAX]; struct xdp_program *prog; const char *bpffs_dir; int err = 0, lock_fd; if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) return -EINVAL; bpffs_dir = get_bpffs_dir(); if (IS_ERR(bpffs_dir)) return PTR_ERR(bpffs_dir); err = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", bpffs_dir, mp->ifindex, mp->main_prog->prog_id); if (err) return err; lock_fd = xdp_lock_acquire(); if (lock_fd < 0) return lock_fd; pr_debug("Unpinning multiprog fd %d beneath %s\n", mp->main_prog->prog_fd, pin_path); for (prog = mp->first_prog; prog; prog = prog->next) { err = try_snprintf(buf, sizeof(buf), "%s/%s-link", pin_path, prog->attach_name); if (err) goto out; err = unlink(buf); if (err) { err = -errno; pr_warn("Couldn't unlink file %s: %s\n", buf, strerror(-err)); goto out; } pr_debug("Unpinned link for prog %s from %s\n", prog->prog_name, buf); err = try_snprintf(buf, sizeof(buf), "%s/%s-prog", pin_path, prog->attach_name); if (err) goto out; err = unlink(buf); if (err) { err = -errno; pr_warn("Couldn't unlink file %s: %s\n", buf, strerror(-err)); goto out; } pr_debug("Unpinned prog %s from %s\n", prog->prog_name, buf); } err = rmdir(pin_path); if (err) err = -errno; pr_debug("Removed pin directory %s\n", pin_path); out: xdp_lock_release(lock_fd); return err; } static int xdp_detach_link(__u32 ifindex, __u32 prog_id) { struct bpf_link_info link_info; __u32 link_info_len, id = 0; int err, fd; while (true) { err = bpf_link_get_next_id(id, &id); if (err) { err = -errno; pr_debug("Can't get next link for id %u: %s", id, strerror(errno)); return err; } fd = bpf_link_get_fd_by_id(id); if (fd < 0) { err = -errno; pr_debug("Can't get link by id %u: %s", id, strerror(errno)); return err; } memset(&link_info, 0, sizeof(link_info)); link_info_len = sizeof(link_info); err = bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); if (err) { err = -errno; pr_debug("Can't get 
link info for %u: %s\n", id, strerror(errno)); break; } if (link_info.type == BPF_LINK_TYPE_XDP && link_info.xdp.ifindex == ifindex && link_info.prog_id == prog_id) { pr_debug("Detach link for id %u for prog %u on interface %u\n", id, prog_id, ifindex); err = bpf_link_detach(fd); if (err) { err = -errno; pr_warn("Can't detach link %u: %s\n", id, strerror(errno)); } break; } close(fd); } close(fd); return err; } static int xdp_multiprog__attach(struct xdp_multiprog *old_mp, struct xdp_multiprog *mp, enum xdp_attach_mode mode) { int err = 0, prog_fd = -1, old_fd = -1, ifindex = -1; if (IS_ERR_OR_NULL(mp) && !old_mp) return -EINVAL; if (mode == XDP_MODE_HW) return -EINVAL; if (mp) { prog_fd = xdp_multiprog__main_fd(mp); if (prog_fd < 0) return -EINVAL; ifindex = mp->ifindex; } if (old_mp) { old_fd = xdp_multiprog__main_fd(old_mp); if (old_fd < 0) return -EINVAL; if (ifindex > -1 && ifindex != old_mp->ifindex) return -EINVAL; ifindex = old_mp->ifindex; } err = xdp_attach_fd(prog_fd, old_fd, ifindex, mode); if (err < 0) { if (errno == EBUSY && !mp) { pr_debug("Detaching link on ifindex %d\n", ifindex); return xdp_detach_link(ifindex, xdp_multiprog__main_id(old_mp)); } goto err; } if (mp) pr_debug("Loaded %zu programs on ifindex %d%s\n", mp->num_links, ifindex, mode == XDP_MODE_SKB ? " in skb mode" : ""); else pr_debug("Detached %s on ifindex %d%s\n", xdp_multiprog__is_legacy(old_mp) ? "program" : "multiprog", ifindex, mode == XDP_MODE_SKB ? 
" in skb mode" : ""); return 0; err: return err; } int xdp_multiprog__detach(struct xdp_multiprog *mp) { int err = 0; if (IS_ERR_OR_NULL(mp) || !mp->is_loaded) return libxdp_err(-EINVAL); if (mp->hw_prog) { err = xdp_multiprog__detach_hw(mp); if (err) return libxdp_err(err); } if (mp->main_prog) { err = xdp_multiprog__attach(mp, NULL, mp->attach_mode); if (err) return libxdp_err(err); if (!xdp_multiprog__is_legacy(mp)) err = xdp_multiprog__unpin(mp); } return libxdp_err(err); } struct xdp_program *xdp_multiprog__next_prog(const struct xdp_program *prog, const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp) || xdp_multiprog__is_legacy(mp)) return libxdp_err_ptr(0, true); if (prog) return prog->next; return mp->first_prog; } struct xdp_program *xdp_multiprog__hw_prog(const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return libxdp_err_ptr(0, true); return mp->hw_prog; } enum xdp_attach_mode xdp_multiprog__attach_mode(const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return XDP_MODE_UNSPEC; return mp->attach_mode; } struct xdp_program *xdp_multiprog__main_prog(const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return libxdp_err_ptr(0, true); return mp->main_prog; } bool xdp_multiprog__is_legacy(const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return false; return mp->is_legacy; } int xdp_multiprog__program_count(const struct xdp_multiprog *mp) { if (IS_ERR_OR_NULL(mp)) return libxdp_err(-EINVAL); return mp->num_links; } bool xdp_multiprog__xdp_frags_support(const struct xdp_multiprog *mp) { return !xdp_multiprog__is_legacy(mp) && mp->config.is_xdp_frags; } bool xdp_multiprog__xdp_dev_bound(const struct xdp_multiprog *mp) { return !xdp_multiprog__is_legacy(mp) && mp->config.is_xdp_devbound; } static int remove_pin_dir(const char *subdir) { char prog_path[PATH_MAX], pin_path[PATH_MAX]; int err; DIR *d; const char *dir = get_bpffs_dir(); if (IS_ERR(dir)) return PTR_ERR(dir); err = try_snprintf(pin_path, sizeof(pin_path), "%s/%s", dir, 
subdir); if (err) return err; d = opendir(pin_path); if (!d) { err = -errno; pr_warn("Failed to open pin directory: %s\n", strerror(-err)); return err; } for (struct dirent *dent = readdir(d); dent; dent = readdir(d)) { /* skip . and .. */ if (dent->d_type == DT_DIR) continue; err = try_snprintf(prog_path, sizeof(prog_path), "%s/%s", pin_path, dent->d_name); if (err) goto err; err = unlink(prog_path); if (err) { err = -errno; pr_warn("Couldn't unlink file %s/%s: %s\n", subdir, dent->d_name, strerror(-err)); goto err; } } err = rmdir(pin_path); if (err) { err = -errno; pr_warn("Failed to remove pin directory %s: %s\n", pin_path, strerror(-err)); } err: closedir(d); return err; } int libxdp_clean_references(int ifindex) { int err = 0, lock_fd, path_ifindex; __u32 dir_prog_id, prog_id = 0; DIR *d; const char *dir = get_bpffs_dir(); if (IS_ERR(dir)) return libxdp_err(PTR_ERR(dir)); lock_fd = xdp_lock_acquire(); if (lock_fd < 0) return libxdp_err(lock_fd); d = opendir(dir); if (!d) { err = -errno; pr_debug("Failed to open bpffs directory: %s\n", strerror(-err)); goto out; } for (struct dirent *dent = readdir(d); dent; dent = readdir(d)) { if (dent->d_type != DT_DIR) continue; if (sscanf(dent->d_name, "dispatch-%d-%"PRIu32"", &path_ifindex, &dir_prog_id) != 2) continue; /* If ifindex is set, skip this dir if it doesn't match */ if (ifindex && path_ifindex != ifindex) continue; xdp_get_ifindex_prog_id(path_ifindex, &prog_id, NULL, NULL); if (!prog_id || prog_id != dir_prog_id) { pr_info("Prog id %"PRIu32" no longer attached on ifindex %d, removing pin directory %s\n", dir_prog_id, path_ifindex, dent->d_name); err = remove_pin_dir(dent->d_name); if (err) break; } } closedir(d); out: xdp_lock_release(lock_fd); return libxdp_err(err); } xdp-tools-1.6.1/lib/libxdp/libxdp.map000066400000000000000000000040111514310632100174510ustar00rootroot00000000000000LIBXDP_1.0.0 { global: libxdp_get_error; libxdp_set_print; libxdp_strerror; xdp_multiprog__attach_mode; xdp_multiprog__close; 
xdp_multiprog__detach; xdp_multiprog__get_from_ifindex; xdp_multiprog__is_legacy; xdp_multiprog__next_prog; xdp_multiprog__main_prog; xdp_multiprog__hw_prog; xdp_program__attach; xdp_program__attach_multi; xdp_program__bpf_obj; xdp_program__btf; xdp_program__chain_call_enabled; xdp_program__close; xdp_program__detach; xdp_program__detach_multi; xdp_program__find_file; xdp_program__from_bpf_obj; xdp_program__from_fd; xdp_program__from_id; xdp_program__from_pin; xdp_program__fd; xdp_program__id; xdp_program__is_attached; xdp_program__name; xdp_program__open_file; xdp_program__pin; xdp_program__print_chain_call_actions; xdp_program__run_prio; xdp_program__set_chain_call_enabled; xdp_program__set_run_prio; xdp_program__tag; }; LIBXDP_1.2.0 { libxdp_clean_references; xdp_multiprog__program_count; xsk_setup_xdp_prog; xsk_socket__create; xsk_socket__create_shared; xsk_socket__delete; xsk_socket__fd; xsk_socket__update_xskmap; xsk_umem__create; xsk_umem__delete; xsk_umem__fd; xsk_cons_nb_avail; xsk_prod_nb_free; xsk_ring_cons__cancel; xsk_ring_cons__comp_addr; xsk_ring_cons__peek; xsk_ring_cons__release; xsk_ring_cons__rx_desc; xsk_ring_prod__fill_addr; xsk_ring_prod__needs_wakeup; xsk_ring_prod__reserve; xsk_ring_prod__submit; xsk_ring_prod__tx_desc; xsk_umem__add_offset_to_addr; xsk_umem__extract_addr; xsk_umem__extract_offset; xsk_umem__get_data; } LIBXDP_1.0.0; LIBXDP_1.3.0 { xdp_multiprog__xdp_frags_support; xdp_program__clone; xdp_program__create; xdp_program__set_xdp_frags_support; xdp_program__test_run; xdp_program__xdp_frags_support; } LIBXDP_1.2.0; LIBXDP_1.4.0 { xsk_umem__create_with_fd; } LIBXDP_1.3.0; LIBXDP_1.5.0 { xsk_umem__create_opts; xsk_socket__create_opts; } LIBXDP_1.4.0; LIBXDP_1.6.0 { xdp_multiprog__xdp_dev_bound; } LIBXDP_1.5.0; xdp-tools-1.6.1/lib/libxdp/libxdp.mk000066400000000000000000000003171514310632100173100ustar00rootroot00000000000000LIBXDP_VERSION := $(shell sed -ne "/LIBXDP_[0-9\.]\+ {/ {s/LIBXDP_\([0-9\.]\+\) {/\1/;p;}" 
$(LIB_DIR)/libxdp/libxdp.map | tail -n 1) LIBXDP_MAJOR_VERSION := $(shell echo $(LIBXDP_VERSION) | sed 's/\..*//') xdp-tools-1.6.1/lib/libxdp/libxdp.pc.template000066400000000000000000000003671514310632100211220ustar00rootroot00000000000000# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) prefix=@PREFIX@ libdir=@LIBDIR@ includedir=${prefix}/include Name: libxdp Description: XDP library Version: @VERSION@ Libs: -L${libdir} -lxdp Requires.private: libbpf Cflags: -I${includedir} xdp-tools-1.6.1/lib/libxdp/libxdp_internal.h000066400000000000000000000075101514310632100210260ustar00rootroot00000000000000#ifndef __LIBXDP_LIBXDP_INTERNAL_H #define __LIBXDP_LIBXDP_INTERNAL_H #include #include #include #include #include #include #include #define LIBXDP_HIDE_SYMBOL __attribute__((visibility("hidden"))) #define __unused __attribute__((unused)) #define __printf(a, b) __attribute__((format(printf, a, b))) static inline int try_snprintf(char *buf, size_t buf_len, const char *format, ...) { va_list args; int len; va_start(args, format); len = vsnprintf(buf, buf_len, format, args); va_end(args); if (len < 0) return -EINVAL; else if ((size_t)len >= buf_len) return -ENAMETOOLONG; return 0; } LIBXDP_HIDE_SYMBOL __printf(2, 3) void libxdp_print(enum libxdp_print_level level, const char *format, ...); #define __pr(level, fmt, ...) \ do { \ libxdp_print(level, "libxdp: " fmt, ##__VA_ARGS__); \ } while (0) #define pr_warn(fmt, ...) __pr(LIBXDP_WARN, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) __pr(LIBXDP_INFO, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) __pr(LIBXDP_DEBUG, fmt, ##__VA_ARGS__) LIBXDP_HIDE_SYMBOL int check_xdp_prog_version(const struct btf *btf, const char *name, __u32 *version); LIBXDP_HIDE_SYMBOL int libxdp_check_kern_compat(void); #define min(x, y) ((x) < (y) ? x : y) #define max(x, y) ((x) > (y) ? 
x : y) #ifndef offsetof #define offsetof(type, member) ((size_t) & ((type *)0)->member) #endif #ifndef offsetofend #define offsetofend(TYPE, FIELD) (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) #endif #ifndef container_of #define container_of(ptr, type, member) \ ({ \ const typeof(((type *)0)->member) *__mptr = (ptr); \ (type *)((char *)__mptr - offsetof(type, member)); \ }) #endif #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) /* OPTS macros, from libbpf_internal.h */ static inline bool libxdp_is_mem_zeroed(const char *obj, size_t off_start, size_t off_end) { const char *p; for (p = obj + off_start; p < obj + off_end; p++) { if (*p) return false; } return true; } static inline bool libxdp_validate_opts(const char *opts, size_t opts_sz, size_t user_sz, const char *type_name) { if (user_sz < sizeof(size_t)) { pr_warn("%s size (%zu) is too small\n", type_name, user_sz); return false; } if (!libxdp_is_mem_zeroed(opts, opts_sz, user_sz)) { pr_warn("%s has non-zero extra bytes\n", type_name); return false; } return true; } #define OPTS_VALID(opts, type) \ (!(opts) || libxdp_validate_opts((const char *)opts, \ offsetofend(struct type, \ type##__last_field), \ (opts)->sz, #type)) #define OPTS_HAS(opts, field) \ ((opts) && opts->sz >= offsetofend(typeof(*(opts)), field)) #define OPTS_GET(opts, field, fallback_value) \ (OPTS_HAS(opts, field) ? 
(opts)->field : fallback_value) #define OPTS_SET(opts, field, value) \ do { \ if (OPTS_HAS(opts, field)) \ (opts)->field = value; \ } while (0) #define OPTS_ZEROED(opts, last_nonzero_field) \ (!(opts) || libxdp_is_mem_zeroed((const void *)opts, \ offsetofend(typeof(*(opts)), \ last_nonzero_field), \ (opts)->sz)) /* handle direct returned errors */ static inline int libxdp_err(int ret) { if (ret < 0) errno = -ret; return ret; } /* handle error for pointer-returning APIs, err is assumed to be < 0 always */ static inline void *libxdp_err_ptr(int err, bool ret_null) { /* set errno on error, this doesn't break anything */ errno = -err; if (ret_null) return NULL; /* legacy: encode err as ptr */ return ERR_PTR(err); } LIBXDP_HIDE_SYMBOL int xdp_lock_acquire(void); LIBXDP_HIDE_SYMBOL int xdp_lock_release(int lock_fd); LIBXDP_HIDE_SYMBOL int xdp_attach_fd(int prog_fd, int old_fd, int ifindex, enum xdp_attach_mode mode); #endif /* __LIBXDP_LIBXDP_INTERNAL_H */ xdp-tools-1.6.1/lib/libxdp/protocol.org000066400000000000000000000605051514310632100200540ustar00rootroot00000000000000#+OPTIONS: ^:nil * Protocol for atomic loading of multi-prog dispatchers With the support for the =freplace= program type, it is possible to load multiple XDP programs on a single interface by building a /dispatcher/ program which will run on the interface, and which will call the component XDP programs as functions using the =freplace= type. For this to work in an interoperable way, applications need to agree on how to attach their XDP programs using this mechanism. This document outlines the protocol implemented by =libxdp=, serving as both documentation and a blueprint for anyone else who wants to implement the same protocol and interoperate. ** Generating a dispatcher The dispatcher is simply an XDP program that will call each of a number of stub functions in turn, and depending on their return code either continue on to the next function or return immediately. 
These stub functions are then replaced at load time with the user XDP programs, using the =freplace= functionality. *** Dispatcher format The dispatcher XDP program contains the main function containing the dispatcher logic, 10 stub functions that can be replaced by component BPF programs, and a configuration structure that is used by the dispatcher logic. In =libxdp=, this dispatcher is generated by [[https://github.com/xdp-project/xdp-tools/blob/main/lib/libxdp/xdp-dispatcher.c.in][an M4 macro file]] which expands to the following: #+begin_src C #define XDP_METADATA_SECTION "xdp_metadata" #define XDP_DISPATCHER_VERSION 2 #define XDP_DISPATCHER_MAGIC 236 #define XDP_DISPATCHER_RETVAL 31 #define MAX_DISPATCHER_ACTIONS 10 struct xdp_dispatcher_config { __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ __u8 num_progs_enabled; /* Number of active program slots */ __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; __u32 program_flags[MAX_DISPATCHER_ACTIONS]; }; /* While 'const volatile' sounds a little like an oxymoron, there's reason * behind the madness: * * - const places the data in rodata, where libbpf will mark it as read-only and * frozen on program load, letting the kernel do dead code elimination based * on the values. * * - volatile prevents the compiler from optimising away the checks based on the * compile-time value of the variables, which is important since we will be * changing the values before loading the program into the kernel. */ static volatile const struct xdp_dispatcher_config conf = {}; /* The volatile return value prevents the compiler from assuming it knows the * return value and optimising based on that. 
*/ __attribute__ ((__noinline__)) int prog0(struct xdp_md *ctx) { volatile int ret = XDP_DISPATCHER_RETVAL; if (!ctx) return XDP_ABORTED; return ret; } /* the above is repeated as prog1...prog9 */ SEC("xdp") int xdp_dispatcher(struct xdp_md *ctx) { __u8 num_progs_enabled = conf.num_progs_enabled; int ret; if (num_progs_enabled < 1) goto out; ret = prog0(ctx); if (!((1U << ret) & conf.chain_call_actions[0])) return ret; /* the above is repeated for prog1...prog9 */ out: return XDP_PASS; } char _license[] SEC("license") = "GPL"; __uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION); #+end_src The dispatcher program is pre-compiled and distributed with =libxdp=. Because the configuration struct is marked as =const= in the source file, it will be put into the =rodata=, which libbpf will turn into a read-only (frozen) map on load. This allows the kernel verifier to perform dead code elimination based on the values in the map. This is also the reason for the =num_progs_enabled= member of the config struct: together with the checks in the main dispatcher function the verifier will effectively remove all the stub function calls not being used, without having to rely on dynamic compilation. When generating a dispatcher, this BPF object file is opened and the configuration struct is populated before the object is loaded. As a forward compatibility measure, =libxdp= will also check for the presence of the =dispatcher_version= field in the =xdp_metadata= section (encoded like the program metadata described in "Processing program metadata" below), and if it doesn't match the expected version (currently version 2), will abort any action. *** Populating the dispatcher configuration map On loading, the dispatcher configuration map is populated as follows: - The =magic= field is set to the =XDP_DISPATCHER_MAGIC= value (236). This field is here to make it possible to check if a program is a dispatcher without looking at the program BTF in the future. 
- The =dispatcher_version= field is set to the current dispatcher version (2). This is redundant with the BTF-encoded version in the metadata field, but must be checked so that the BTF metadata version can be removed in the future. See the section on old dispatcher versions below. - The =num_progs_enabled= member is simply set to the number of active programs that will be attached to this dispatcher. - The =is_xdp_frags= variable is set to 1 if dispatcher is loaded with XDP frags support (see section below), or 0 otherwise. The two other fields contain per-component program metadata, which is read from the component programs as explained in the "Processing program metadata" section below. - The =chain_call_actions= array is populated with a bitmap signifying which XDP actions (return codes) of each component program should be interpreted as a signal to continue execution of the next XDP program. For instance, a packet filtering program might designate that an =XDP_PASS= action should make execution continue, while other return codes should immediately end the call chain and return. The special =XDP_DISPATCHER_RETVAL= (which is set to 31 corresponding to the topmost bit in the bitmap) is always included in each programs' =chain_call_actions=; this value is returned by the stub functions, which ensures that should a component program become detached, processing will always continue past the stub function. - The =run_prios= array contains the effective run priority of each component program when it was installed. This is also read as program metadata, but because it can be overridden at load time, the effective value is stored in the configuration array so it can be carried forward when the dispatcher is replaced. Component programs are expected to be sorted in order of their run priority (as explained below in "Loading and attaching component programs"). - The =program_flags= is used to store the flags that an XDP program was loaded with. 
This is populated with the value of the =BPF_F_XDP_HAS_FRAGS= flag if the component program in this slot had that flag set (see the section on XDP frags support below), and is 0 otherwise. **** Processing program metadata As explained above, each component program must specify one or more chain call actions and a run priority on attach. When loading a user program, =libxdp= will attempt to read this metadata from the object file as explained in the following; if no values are found in the object file, a default run priority of 50 will be applied, and =XDP_PASS= will be the only chain call action. The metadata is read from the object file by looking for BTF-encoded metadata in the =.xdp_run_config= object section, encoded similar to the BTF-defined maps used by libbpf (in the =.maps= section). Here, =libxdp= will look for a struct definition with the XDP program function name prefixed by an underscore (e.g., if the main XDP function is called =xdp_main=, libxdp will look for a struct definition called =_xdp_main=). In this struct, a member =priority= encodes the run priority, each XDP action can be set as a chain call action by setting a struct member with the action name. The =xdp_helpers.h= header file included with XDP exposes helper macros that can be used with the existing helpers in =bpf_helpers.h= (from libbpf), so a full run configuration metadata section can be defined as follows: #+begin_src C #include #include struct { __uint(priority, 10); __uint(XDP_PASS, 1); __uint(XDP_DROP, 1); } XDP_RUN_CONFIG(my_xdp_func); #+end_src This example sets priority 10 with chain call actions =XDP_PASS= and =XDP_DROP= for the XDP program starting at =my_xdp_func()=. 
This turns into the following BTF information (as shown by =bpftool btf dump=): #+begin_src [12] STRUCT '(anon)' size=24 vlen=3 'priority' type_id=13 bits_offset=0 'XDP_PASS' type_id=15 bits_offset=64 'XDP_DROP' type_id=15 bits_offset=128 [13] PTR '(anon)' type_id=14 [14] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=10 [15] PTR '(anon)' type_id=16 [16] ARRAY '(anon)' type_id=6 index_type_id=10 nr_elems=1 [17] VAR '_my_xdp_func' type_id=12, linkage=global-alloc [18] DATASEC '.xdp_run_config' size=0 vlen=1 type_id=17 offset=0 size=24 #+end_src The parser will look for the =.xdp_run_config= DATASEC, then follow the types recursively, extracting the field values from the =nr_elems= in the anonymous arrays in type IDs 14 and 16. While =libxdp= will automatically load any metadata specified as above in the program BTF, the application using =libxdp= can override these values at runtime. These overridden values will be the ones used when determining program order, and will be preserved in the dispatcher configuration map for subsequent operations. *** Old versions of the XDP dispatcher This document currently describes version 2 of the dispatcher and protocol. This differs from version 1 in the following respects: - The dispatcher configuration map has gained the =magic= and =dispatcher_version= fields for identifying the dispatcher and its version.. - The protocol now supports propagating the value of the =BPF_F_XDP_HAS_FRAGS= field for supporting XDP frags programs for higher MTU. The dispatcher configuration map has gained the =is_xdp_frags= and =program_flags= fields for use with this feature. The protocol for propagating the frags field is described below, and an implementation of this protocol that recognises version 2 of the dispatcher MUST implement this protocol. 
Older versions of libxdp will check the dispatcher version field of any dispatcher loaded in the kernel, and refuse to operate on a dispatcher with a higher version than the library version implements. This means that if a newer dispatcher is loaded, old versions of the library will be locked out of modifying that dispatcher. This is by design: old library versions don't recognise the semantics of new features added in subsequent versions, and so would introduce bugs if it attempted to operate on newer versions. Newer versions of libxdp will, however, recognise older dispatcher versions. If a newer version of libxdp loads a new program and finds an old dispatcher version already loaded on an interface, it will display the programs attached to it, but will refuse to replace it with a newer version so as not to lock out the program that loaded the program(s) already attached. Manually unloading the loaded programs will be required to load a new dispatcher version on the interface. *** Loading and attaching component programs When loading one or more XDP programs onto an interface (assuming no existing program is found on the interface; for adding programs, see below), =libxdp= first prepares a dispatcher program with the right number of slots, by populating the configuration struct as described above. Then, this dispatcher program is loaded into the kernel, with the =BPF_F_XDP_HAS_FRAGS= flag set if all component programs have that flag set (see the section on supporting XDP frags below). Having loaded the dispatcher program, =libxdp= then loads each of the component programs. To do this, first the list of component programs is sorted by their run priority, forming the final run sequence. 
Should several programs have the same run priority, ties are broken in the following arbitrary, but deterministic, order (see =cmp_xdp_programs()= [[https://github.com/xdp-project/xdp-tools/blob/main/lib/libxdp/libxdp.c][in libxdp.c]]): - By XDP function name (=bpf_program__name()= from libbpf) - By sorting already-loaded programs before not-yet-loaded ones - By unloaded programs by program size - By loaded program bpf tag value (using =memcmp()=) - By load time Before loading, each component program type is reset to =BPF_PROG_TYPE_EXT= with an expected attach type of 0, and the =BPF_F_XDP_HAS_FRAGS= is unset (see the section on supporting frags below). Then, the attachment target is set to the dispatcher file descriptor and the BTF ID of the stub function to replace (i.e., the first component program has =prog0()= as its target, and so on). Then the program is loaded, at which point the kernel will verify the component program's compatibility with the attach point. Having loaded the component program, it is attached to the dispatcher by way of =bpf_link_create()=, specifying the same target file description and BTF ID used when loading the program. This will return a link fd, which will be pinned to prevent the attachment to unravel when the fd is closed (see "Locking and pinning" below). *** Locking and pinning To prevent the kernel from detaching any =freplace= program when its last file description is closed, the programs must be pinned in =bpffs=. This is done in the =xdp= subdirectory of =bpffs=, which by default means =/sys/fs/bpf/xdp=. If the =LIBXDP_BPFFS= environment variable is set, this will override the location of the top-level =bpffs=, and the =xdp= subdirectory will be created beneath this path. 
The pathnames generated for pinning are the following: - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID - dispatcher program for IFINDEX with BPF program ID DID - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-prog - component program 0, program reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog0-link - component program 0, bpf_link reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-prog - component program 1, program reference - /sys/fs/bpf/xdp/dispatch-IFINDEX-DID/prog1-link - component program 1, bpf_link reference - etc, up to ten component programs This means that several pin operations have to be performed for each dispatcher program. Semantically, these are all atomic, so to make sure every consumer of the hierarchy of pinned files gets a consistent view, locking is needed. This is implemented by opening the parent directory =/sys/fs/bpf/xdp= with the =O_DIRECTORY= flag, and obtaining a lock on the resulting file descriptor using =flock(lock_fd, LOCK_EX)=. When creating a new dispatcher program, it will first be fully populated, with all component programs attached. Then, the programs will be linked in =bpffs= as specified above, and once this succeeds, the program will be attached to the interface. If attaching the program fails, the programs will be unpinned again, and the error returned to the caller. This order ensures atomic attachment to the interface, without any risk that component programs will be automatically detached due to a badly timed application crash. When loading the initial dispatcher program, the =XDP_FLAGS_UPDATE_IF_NOEXIST= flag is set to prevent accidentally overriding any concurrent modifications. If this fails, the whole operation starts over, turning the load into a modification as described below. 
*** Supporting XDP programs with frags support (BPF_F_XDP_HAS_FRAGS flag) Linux kernel 5.18 added support for a new API that allows XDP programs to access packet data that spans more than a single page, allowing XDP programs to be loaded on interfaces with bigger MTUs. Such packets will not have all their packet data accessible by the traditional "direct packet access"; instead, only the first fragment will be available this way, and the rest of the packet data has to be accessed via the new =bpf_xdp_load_bytes()= helper. Existing XDP programs are written with the assumption that they can see the whole packet data using direct packet access, which means they can subtly malfunction if some of the packet data is suddenly invisible (for instance, counting packet lengths is no longer accurate). Whether a given XDP program supports the frags API or not is a semantic issue, and it's not possible for the kernel to auto-detect this. For this reason, programs have to opt in to XDP frags support at load time, by setting the =BPF_F_XDP_HAS_FRAGS= flag as they are loaded into the kernel. Programs that are not loaded with this flag will be rejected from attaching to network devices that use packet fragment (i.e., those with a large MTU). This has implications for the XDP dispatcher, as its purpose is for multiple programs to be loaded at the same time. Since the =BPF_F_XDP_HAS_FRAGS= cannot be set for individual component programs, it has to be set for the dispatcher as a whole. However, as described above, programs can subtly malfunction if they are exposed to packets with fragments without being ready to do so. This means that it's only safe to set the =BPF_F_XDP_HAS_FRAGS= on the dispatcher itself if *all* component programs have the flag set. To properly propagate the flags even when adding new programs to an existing dispatcher, the dispatcher itself needs to keep track of which of its component programs had the =BPF_F_XDP_HAS_FRAGS= flag set when they were added. 
The dispatcher configuration map users the =program_flags= array for this: for each component program, this field is set to the value of the =BPF_F_XDP_HAS_FRAGS= flag if that component program has the flag set, and to 0 otherwise. An additional field, =is_xdp_frags=, is set if the dispatcher itself is loaded with the frags field set (which may not be the case if the kernel doesn't support the flag). When generating a dispatcher for a set of programs, libxdp simply tracks if all component programs support the =BPF_F_XDP_HAS_FRAGS=, and if they do, the dispatcher is loaded with this flag set. If any program attached to the dispatcher does not support the flag, the dispatcher is loaded without this flag set (and the =is_xdp_frags= field in the dispatcher configuration is set accordingly). If libxdp determines that the running kernel does not support the =BPF_F_XDP_HAS_FRAGS=, the dispatcher is loaded without the flag regardless of the value of the component programs. When adding a program to an existing dispatcher, this may result in a "downgrade", i.e., loading a new dispatcher without the frags flag to replace an existing dispatcher that does have the flag set. This will result in the replacement dispatcher being rejected by the kernel at attach time, but only if the interface being attached to actually requires the frags flag (i.e., if it has a large MTU). If the attachment is rejected, the old dispatcher will stay in place, leading to no loss of functionality. ** Adding or removing programs from an existing dispatcher The sections above explain how to generate a dispatcher and attach it to an interface, assuming no existing program is attached. When one or more programs is already attached, a couple of extra steps are required to ensure that the switch is made atomically. Briefly, changing the programs attached to an interface entails the following steps: - Reading the existing dispatcher program and obtaining references to the component programs. 
- Generating a new dispatcher containing the new set of programs (adding or removing the programs needed). - Atomically swapping out the XDP program attachment on the interface so the new dispatcher takes over from the old one. - Unpinning and dismantling the old dispatcher. These operations are each described in turn in the following sections. *** Reading list of existing programs from the kernel The first step is to obtain the ID of the currently loaded XDP program using =bpf_get_link_xdp_info()=. A file descriptor to the dispatcher is obtained using =bpf_prog_get_fd_by_id()=, and the BTF information attached to the program is obtained from the kernel. This is checked for the presence of the dispatcher version field (as explained above), and the operation is aborted if this is not present, or doesn't match what the library expects. Having thus established that the program loaded on the interface is indeed a compatible dispatcher, the map ID of the map containing the configuration struct is obtained from the kernel, and the configuration data is loaded from the map (after checking that the map value size matches the expected configuration struct). Then, the file lock on the directory in =bpffs= is obtained as explained in the "Locking and pinning" section above, and, while holding this lock, file descriptors to each of the component programs and =bpf_link= objects are obtained. The end result is a reference to the full dispatcher structure (and its component programs), corresponding to that generated on load. When populating the component program structure in memory, the chain call actions and run priority from the dispatcher configuration map is used instead of parsing the BTF metadata of each program: This ensures that any modified values specified at load time will be retained in stead of being reverted to the values compiled into the BTF metadata. 
Similarly, the =program_flags= array of the in-kernel dispatcher is used to determine which of the existing component programs support the =BPF_F_XDP_HAS_FRAGS= flag (see the section on frags support above). *** Generating a new dispatcher Having obtained a reference to the existing dispatcher, =libxdp= takes that and the list of programs to add to or remove from the interface, and simply generates a new dispatcher with the new set of programs. When adding programs, the whole list of programs is sorted according to their run priorities (as explained above), resulting in new programs being inserted in the right place in the existing sequence according to their priority. Generating this secondary dispatcher relies on the support for multiple attachments for =freplace= programs, which was added in kernel 5.10. This allows the =bpf_link_create()= operation to specify an attachment target in the new dispatcher. In other words, the component programs will briefly be attached to both the old and new dispatcher, but only one of those will be attached to the interface. After completion of the new dispatcher, its component programs are pinned in =bpffs= as described above. *** Atomic replace and retry At this point, =libxdp= has references to both the old dispatcher, already attached to the interface, and the new one with the modified set of component programs. The new dispatcher is then atomically swapped out with the old one, using the =XDP_FLAGS_REPLACE= flag to the netlink operation (and the accompanying =IFLA_XDP_EXPECTED_FD= attribute). Once the atomic replace operation succeeds, the old dispatcher is unpinned from =bppfs= and the in-memory references to both the old and new dispatchers are released (since the new dispatcher was already pinned, preventing it from being detached from the interface). 
Should this atomic replace instead *fail* because the program attached to the interface changed while the new dispatcher was being built, the whole operation is simply started over from the beginning. That is, the new dispatcher is unpinned from =bpffs=, and the in-memory references to both dispatchers are released (but no unpinning of the old dispatcher is performed!). Then, the program ID attached to the interface is again read from the kernel, and the operation proceeds from "Reading list of existing programs from the kernel". ** Compatibility with older kernels The full functionality described above can only be attained with kernels version 5.10 or newer, because this is the version that introduced support for re-attaching an freplace program in a secondary attachment point. However, the freplace functionality itself was introduced in kernel 5.7, so for kernel versions 5.7 to 5.9, multiple programs can be attached as long as they are all attached to the dispatcher immediately as they are loaded. This is achieved by using =bpf_raw_tracepoint_open()= in place of =bpf_link_create()= when attaching the component programs to the dispatcher. The =bpf_raw_tracepoint_open()= function doesn't take an attach target as a parameter; instead, it simply attached the freplace program to the target that was specified at load time (which is why it only works when all component programs are loaded together with the dispatcher). 
xdp-tools-1.6.1/lib/libxdp/tests/000077500000000000000000000000001514310632100166365ustar00rootroot00000000000000xdp-tools-1.6.1/lib/libxdp/tests/.gitignore000066400000000000000000000002311514310632100206220ustar00rootroot00000000000000test_xsk_refcnt check_kern_compat test_xdp_frags test_dispatcher_versions test_xsk_non_privileged test_link_detach test_xsk_umem_flags test_xdp_devbound xdp-tools-1.6.1/lib/libxdp/tests/Makefile000066400000000000000000000056431514310632100203060ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) USER_TARGETS := test_xsk_refcnt test_xdp_frags test_link_detach test_xsk_umem_flags test_xdp_devbound STATIC_LINK_TARGETS := check_kern_compat test_dispatcher_versions BPF_TARGETS := xdp_dispatcher_v1 xdp_dispatcher_v2 xdp_pass USER_LIBS := -lpthread USER_EXTRA_C := test_utils.c EXTRA_DEPS += xdp_dispatcher.h EXTRA_USER_DEPS += test_utils.h $(USER_EXTRA_C) TEST_FILE := ./test-libxdp.sh TEST_RUNNER := ./test_runner.sh LIB_DIR := ../.. LDLIBS += $(USER_LIBS) include $(LIB_DIR)/libxdp/libxdp.mk include $(LIB_DIR)/defines.mk ifeq ($(HAVE_CAP_NG),y) USER_TARGETS += test_xsk_non_privileged CFLAGS += $(CAP_NG_CFLAGS) LDLIBS += $(CAP_NG_LDLIBS) endif USER_C := ${USER_TARGETS:=.c} ${STATIC_LINK_TARGETS:=.c} USER_OBJ := ${USER_C:.c=.o} BPF_OBJS := $(BPF_TARGETS:=.o) LDFLAGS+=-L$(LIBXDP_DIR) STATIC_OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.a STATIC_LDLIBS:=-l:libxdp.a $(LDLIBS) ifeq ($(DYNAMIC_LIBXDP),1) LDLIBS:=-lxdp $(LDLIBS) OBJECT_LIBXDP:=$(LIBXDP_DIR)/libxdp.so.$(LIBXDP_VERSION) else LDLIBS:=$(STATIC_LDLIBS) OBJECT_LIBXDP:=$(STATIC_OBJECT_LIBXDP) endif # Detect submodule libbpf source file changes ifeq ($(SYSTEM_LIBBPF),n) LIBBPF_SOURCES := $(wildcard $(LIBBPF_DIR)/src/*.[ch]) endif LIBXDP_SOURCES := $(wildcard $(LIBXDP_DIR)/*.[ch] $(LIBXDP_DIR)/*.in) CFLAGS += -I$(HEADER_DIR) BPF_HEADERS := $(wildcard $(HEADER_DIR)/bpf/*.h) $(wildcard $(HEADER_DIR)/xdp/*.h) all: $(USER_TARGETS) $(STATIC_LINK_TARGETS) $(BPF_OBJS) .PHONY: 
clean clean:: $(Q)rm -f $(USER_TARGETS) $(USER_OBJ) .PHONY: install install: all install -m 0755 -d $(DESTDIR)$(SCRIPTSDIR)/tests/libxdp install -m 0644 $(TEST_FILE) $(DESTDIR)$(SCRIPTSDIR)/tests/libxdp install -m 0755 $(USER_TARGETS) $(STATIC_LINK_TARGETS) $(DESTDIR)$(SCRIPTSDIR)/tests/libxdp install -m 0644 $(BPF_OBJS) $(DESTDIR)$(SCRIPTSDIR)/tests/libxdp $(OBJECT_LIBBPF): $(LIBBPF_SOURCES) $(Q)$(MAKE) -C $(LIB_DIR) libbpf $(OBJECT_LIBXDP): $(LIBXDP_SOURCES) $(Q)$(MAKE) -C $(LIBXDP_DIR) # Create expansions for dependencies LIB_H := ${LIB_OBJS:.o=.h} # Detect if any of common obj changed and create dependency on .h-files $(LIB_OBJS): %.o: %.c %.h $(LIB_H) $(Q)$(MAKE) -C $(dir $@) $(notdir $@) $(USER_TARGETS): %: %.c $(OBJECT_LIBBPF) $(OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) $(QUIET_CC)$(CC) -Wall $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \ $< $(USER_EXTRA_C) $(LDLIBS) $(STATIC_LINK_TARGETS): %: %.c $(OBJECT_LIBBPF) $(STATIC_OBJECT_LIBXDP) $(LIBMK) $(LIB_OBJS) $(EXTRA_DEPS) $(EXTRA_USER_DEPS) $(QUIET_CC)$(CC) -Wall $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ $(LIB_OBJS) \ $< $(USER_EXTRA_C) $(STATIC_LDLIBS) $(BPF_OBJS): %.o: %.c $(BPF_HEADERS) $(LIBMK) $(EXTRA_DEPS) $(QUIET_CLANG)$(CLANG) -target $(BPF_TARGET) $(BPF_CFLAGS) -O2 -c -g -o $@ $< run: all $(Q)env CC="$(CC)" CFLAGS="$(CFLAGS) $(LDFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDLIBS="$(LDLIBS)" V=$(V) $(TEST_RUNNER) $(TEST_FILE) $(RUN_TESTS) xdp-tools-1.6.1/lib/libxdp/tests/check_kern_compat.c000066400000000000000000000003261514310632100224420ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include "test_utils.h" #include "../libxdp_internal.h" int main(__unused int argc, __unused char** argv) { silence_libbpf_logging(); return libxdp_check_kern_compat(); } xdp-tools-1.6.1/lib/libxdp/tests/test-libxdp.sh000066400000000000000000000100121514310632100214230ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) ALL_TESTS="test_link_so 
test_link_a test_old_dispatcher test_xdp_devbound test_xdp_frags test_xsk_prog_refcnt_bpffs test_xsk_prog_refcnt_legacy test_xsk_non_privileged test_link_detach test_xsk_umem_flags" TESTS_DIR=$(dirname "${BASH_SOURCE[0]}") skip_if_missing_libxdp_compat() { if ! $TESTS_DIR/check_kern_compat; then exit "$SKIPPED_TEST" fi } test_link_so() { [ -n "${CC:-}" ] || return $SKIPPED_TEST TMPDIR=$(mktemp --tmpdir -d libxdp-test.XXXXXX) cat >$TMPDIR/libxdptest.c < int main(int argc, char **argv) { (void) argc; (void) argv; (void) xdp_program__open_file("filename", "section_name", NULL); return 0; } EOF $CC -o $TMPDIR/libxdptest $TMPDIR/libxdptest.c $CFLAGS $CPPFLAGS -lxdp $LDLIBS 2>&1 retval=$? rm -rf "$TMPDIR" return $retval } test_link_a() { [ -n "${CC:-}" ] || return $SKIPPED_TEST TMPDIR=$(mktemp --tmpdir -d libxdp-test.XXXXXX) cat >$TMPDIR/libxdptest.c < int main(int argc, char **argv) { (void) argc; (void) argv; (void) xdp_program__open_file("filename", "section_name", NULL); return 0; } EOF $CC -o $TMPDIR/libxdptest $TMPDIR/libxdptest.c $CFLAGS $CPPFLAGS -l:libxdp.a $LDLIBS 2>&1 retval=$? rm -rf "$TMPDIR" return $retval } test_refcnt_once() { # We need multiple queues for this test NUM_QUEUES_REQUIRED=3 ip link add xsk_veth0 numrxqueues $NUM_QUEUES_REQUIRED type veth peer name xsk_veth1 check_run $TESTS_DIR/test_xsk_refcnt xsk_veth0 2>&1 ip link delete xsk_veth0 } check_mount_bpffs() { mount | grep -q /sys/fs/bpf || mount -t bpf bpf /sys/fs/bpf/ || echo "Unable to mount /sys/fs/bpf" mount | grep -q /sys/fs/bpf } check_unmount_bpffs() { while mount | grep -q /sys/fs/bpf; do umount /sys/fs/bpf/ || break done ! 
mount | grep -q /sys/fs/bpf } test_xsk_prog_refcnt_bpffs() { check_mount_bpffs && test_refcnt_once "$@" } test_xsk_prog_refcnt_legacy() { check_unmount_bpffs && test_refcnt_once "$@" } test_xdp_frags() { check_mount_bpffs || return 1 skip_if_missing_libxdp_compat ip link add xdp_veth_big0 mtu 5000 type veth peer name xdp_veth_big1 mtu 5000 ip link add xdp_veth_small0 type veth peer name xdp_veth_small1 check_run $TESTS_DIR/test_xdp_frags xdp_veth_big0 xdp_veth_small0 2>&1 ip link delete xdp_veth_big0 ip link delete xdp_veth_small0 } test_xdp_devbound() { check_mount_bpffs || return 1 skip_if_missing_libxdp_compat ip link add xdp_veth0 type veth peer name xdp_veth1 ip link add xdp_veth2 type veth peer name xdp_veth3 check_run $TESTS_DIR/test_xdp_devbound xdp_veth1 xdp_veth3 2>&1 ip link delete xdp_veth0 ip link delete xdp_veth2 } test_old_dispatcher() { check_mount_bpffs || return 1 skip_if_missing_libxdp_compat export LIBXDP_OBJECT_PATH=$TESTS_DIR ip link add xdp_veth0 type veth peer name xdp_veth1 check_run $TESTS_DIR/test_dispatcher_versions xdp_veth0 ip link delete xdp_veth0 } test_xsk_non_privileged() { if test ! -f $TESTS_DIR/test_xsk_non_privileged; then exit "$SKIPPED_TEST" fi ip link add xdp_veth0 type veth peer name xdp_veth1 check_run $TESTS_DIR/test_xsk_non_privileged xdp_veth0 xdp_veth1 ip link delete xdp_veth0 } test_link_detach() { if test ! 
-f $TESTS_DIR/test_link_detach; then exit "$SKIPPED_TEST" fi export LIBXDP_OBJECT_PATH=$TESTS_DIR ip link add xdp_veth0 type veth peer name xdp_veth1 check_run $TESTS_DIR/test_link_detach xdp_veth0 ip link delete xdp_veth0 } test_xsk_umem_flags() { ip link add xdp_veth0 type veth peer name xdp_veth1 check_run $TESTS_DIR/test_xsk_umem_flags xdp_veth0 ip link delete xdp_veth0 } cleanup_tests() { ip link del dev xdp_veth_big0 >/dev/null 2>&1 ip link del dev xdp_veth_small0 >/dev/null 2>&1 ip link del dev xsk_veth0 >/dev/null 2>&1 ip link del dev xdp_veth0 >/dev/null 2>&1 } xdp-tools-1.6.1/lib/libxdp/tests/test_dispatcher_versions.c000066400000000000000000000202761514310632100241260ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include "test_utils.h" #include "../libxdp_internal.h" #include "xdp_dispatcher.h" #include #include #include #ifndef PATH_MAX #define PATH_MAX 4096 #endif #define BPFFS_DIR "/sys/fs/bpf/xdp" #define PROG_RUN_PRIO 42 #define PROG_CHAIN_CALL_ACTIONS (1 << XDP_DROP) #define DISPATCHER_V1_FILE "xdp_dispatcher_v1.o" #define DISPATCHER_V2_FILE "xdp_dispatcher_v2.o" #ifndef HAVE_LIBBPF_BPF_OBJECT__NEXT_MAP static struct bpf_map *bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map) { return bpf_map__next(map, obj); } #endif static void print_test_result(const char *func, int ret) { fflush(stderr); fprintf(stderr, "%s:\t%s\n", func, ret ? 
"FAILED" : "PASSED"); fflush(stdout); } int get_prog_id(int prog_fd) { struct bpf_prog_info info = {}; __u32 len = sizeof(info); int err; err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); if (err) return -errno; return info.id; } static char* get_dispatcher_file(unsigned int dispatcher_version) { switch (dispatcher_version) { case XDP_DISPATCHER_VERSION_V1: return DISPATCHER_V1_FILE; case XDP_DISPATCHER_VERSION_V2: return DISPATCHER_V2_FILE; default: break; } return NULL; } int load_dispatcher(int ifindex, unsigned int dispatcher_version) { struct xdp_dispatcher_config_v1 dispatcher_config_v1 = {}; struct xdp_dispatcher_config_v2 dispatcher_config_v2 = {}; char *dispatcher_file = get_dispatcher_file(dispatcher_version); struct bpf_object *obj_dispatcher, *obj_prog = NULL; DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); struct bpf_program *dispatcher_prog, *xdp_prog; int ret = 0, btf_id, lfd = -1, dispatcher_id; char pin_path[PATH_MAX], buf[PATH_MAX]; const char *attach_func = "prog0"; struct bpf_map *map; if (!ifindex || !dispatcher_file) return -ENOENT; obj_dispatcher = open_bpf_file(dispatcher_file, NULL); if (IS_ERR_OR_NULL(obj_dispatcher)) return -errno; btf_id = btf__find_by_name_kind(bpf_object__btf(obj_dispatcher), attach_func, BTF_KIND_FUNC); if (btf_id <= 0) { ret = -ENOENT; goto out; } opts.target_btf_id = btf_id; map = bpf_object__next_map(obj_dispatcher, NULL); if (!map) { ret = -ENOENT; goto out; } dispatcher_prog = bpf_object__find_program_by_name(obj_dispatcher, "xdp_dispatcher"); if (!dispatcher_prog) { ret = -errno; goto out; } switch (dispatcher_version) { case XDP_DISPATCHER_VERSION_V1: dispatcher_config_v1.num_progs_enabled = 1; dispatcher_config_v1.chain_call_actions[0] = PROG_CHAIN_CALL_ACTIONS; dispatcher_config_v1.run_prios[0] = PROG_RUN_PRIO; ret = bpf_map__set_initial_value(map, &dispatcher_config_v1, sizeof(dispatcher_config_v1)); break; case XDP_DISPATCHER_VERSION_V2: dispatcher_config_v2.magic = XDP_DISPATCHER_MAGIC; 
dispatcher_config_v2.num_progs_enabled = 1; dispatcher_config_v2.chain_call_actions[0] = PROG_CHAIN_CALL_ACTIONS; dispatcher_config_v2.run_prios[0] = PROG_RUN_PRIO; dispatcher_config_v2.is_xdp_frags = 0; dispatcher_config_v2.program_flags[0] = 0; dispatcher_config_v2.dispatcher_version = XDP_DISPATCHER_VERSION_V2; ret = bpf_map__set_initial_value(map, &dispatcher_config_v2, sizeof(dispatcher_config_v2)); } if (ret) goto out; ret = bpf_object__load(obj_dispatcher); if (ret) goto out; dispatcher_id = get_prog_id(bpf_program__fd(dispatcher_prog)); if (dispatcher_id < 0) { ret = dispatcher_id; goto out; } obj_prog = open_bpf_file("xdp_pass.o", NULL); if (!obj_prog) { ret = -errno; goto out; } xdp_prog = bpf_object__find_program_by_name(obj_prog, "xdp_pass"); if (!xdp_prog) { ret = -errno; goto out; } ret = bpf_program__set_attach_target(xdp_prog, bpf_program__fd(dispatcher_prog), attach_func); if (ret) goto out; bpf_program__set_type(xdp_prog, BPF_PROG_TYPE_EXT); bpf_program__set_expected_attach_type(xdp_prog, 0); ret = bpf_object__load(obj_prog); if (ret) goto out; lfd = bpf_link_create(bpf_program__fd(xdp_prog), bpf_program__fd(dispatcher_prog), 0, &opts); if (lfd < 0) { ret = -errno; goto out; } ret = try_snprintf(pin_path, sizeof(pin_path), "%s/dispatch-%d-%d", BPFFS_DIR, ifindex, dispatcher_id); if (ret) goto out; ret = mkdir(BPFFS_DIR, S_IRWXU); if (ret && errno != EEXIST) { ret = -errno; printf("mkdir err (%s): %s\n", BPFFS_DIR, strerror(-ret)); goto out; } ret = mkdir(pin_path, S_IRWXU); if (ret) { ret = -errno; printf("mkdir err (%s): %s\n", pin_path, strerror(-ret)); goto out; } ret = try_snprintf(buf, sizeof(buf), "%s/prog0-link", pin_path); if (ret) goto err_unpin; ret = bpf_obj_pin(lfd, buf); if (ret) goto err_unpin; ret = try_snprintf(buf, sizeof(buf), "%s/prog0-prog", pin_path); if (ret) goto err_unpin; ret = bpf_obj_pin(bpf_program__fd(xdp_prog), buf); if (ret) goto err_unpin; ret = xdp_attach_fd(bpf_program__fd(dispatcher_prog), -1, ifindex, 
XDP_MODE_NATIVE); if (ret) goto err_unpin; out: if (lfd >= 0) close(lfd); bpf_object__close(obj_dispatcher); bpf_object__close(obj_prog); return ret; err_unpin: if (!try_snprintf(buf, sizeof(buf), "%s/prog0-link", pin_path)) unlink(buf); if (!try_snprintf(buf, sizeof(buf), "%s/prog0-prog", pin_path)) unlink(buf); rmdir(pin_path); goto out; } int check_old_dispatcher(int ifindex, unsigned int dispatcher_version) { struct xdp_multiprog *mp = NULL; struct xdp_program *xdp_prog; char buf[100]; int ret; ret = load_dispatcher(ifindex, dispatcher_version); if (ret) goto out; mp = xdp_multiprog__get_from_ifindex(ifindex); ret = libxdp_get_error(mp); if (ret) goto out; if (xdp_multiprog__is_legacy(mp)) { printf("Got unexpected legacy multiprog\n"); ret = -EINVAL; goto out; } if (xdp_multiprog__program_count(mp) != 1) { printf("Expected 1 attached program, got %d\n", xdp_multiprog__program_count(mp)); ret = -EINVAL; goto out; } xdp_prog = xdp_multiprog__next_prog(NULL, mp); if (!xdp_prog) { ret = -errno; goto out; } if (strcmp(xdp_program__name(xdp_prog), "xdp_pass")) { printf("Expected xdp_pass program, got %s\n", xdp_program__name(xdp_prog)); ret = -EINVAL; goto out; } if (xdp_program__run_prio(xdp_prog) != PROG_RUN_PRIO) { printf("Expected run prio %d got %d\n", PROG_RUN_PRIO, xdp_program__run_prio(xdp_prog)); ret = -EINVAL; goto out; } ret = xdp_program__print_chain_call_actions(xdp_prog, buf, sizeof(buf)); if (ret) goto out; if (strcmp(buf, "XDP_DROP")) { printf("Expected actions XDP_PASS, got %s\n", buf); ret = -EINVAL; goto out; } DECLARE_LIBXDP_OPTS(xdp_program_opts, pass_opts); pass_opts.prog_name = "xdp_pass"; pass_opts.find_filename = "xdp-dispatcher.o"; xdp_prog = xdp_program__create(&pass_opts); ret = libxdp_get_error(xdp_prog); if (ret) goto out; ret = xdp_program__attach(xdp_prog, ifindex, XDP_MODE_NATIVE, 0); xdp_program__close(xdp_prog); if (!ret) { printf("Shouldn't have been able to attach a new program to ifindex!\n"); ret = -EINVAL; goto out; } ret = 0; 
out: if (mp) xdp_multiprog__detach(mp); xdp_multiprog__close(mp); return ret; } static void usage(char *progname) { fprintf(stderr, "Usage: %s \n", progname); exit(EXIT_FAILURE); } int check_old_dispatcher_v1(int ifindex) { int ret = check_old_dispatcher(ifindex, XDP_DISPATCHER_VERSION_V1); print_test_result(__func__, ret); return ret; } int check_old_dispatcher_v2(int ifindex) { int ret = check_old_dispatcher(ifindex, XDP_DISPATCHER_VERSION_V2); print_test_result(__func__, ret); return ret; } int main(int argc, char **argv) { int ifindex, ret; char *envval; envval = secure_getenv("VERBOSE_TESTS"); silence_libbpf_logging(); if (envval && envval[0] == '1') verbose_libxdp_logging(); else silence_libxdp_logging(); if (argc != 2) usage(argv[0]); ifindex = if_nametoindex(argv[1]); ret = check_old_dispatcher_v1(ifindex); ret = check_old_dispatcher_v2(ifindex) || ret; return ret; } xdp-tools-1.6.1/lib/libxdp/tests/test_link_detach.c000066400000000000000000000056251514310632100223160ustar00rootroot00000000000000 /* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include #define SKIPPED_TEST 249 // needs to match SKIPPED_TEST value in test_runner.sh static void usage(char *progname) { fprintf(stderr, "Usage: %s \n", progname); exit(EXIT_FAILURE); } static int check_link_detach(int ifindex, enum xdp_attach_mode mode) { DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); struct bpf_object *obj_prog = NULL; struct bpf_program *prog; struct xdp_multiprog *mp = NULL; int ret, prog_fd, link_fd =0; if (!ifindex) return -EINVAL; obj_prog = open_bpf_file("xdp_pass.o", NULL); if (!obj_prog) { ret = -errno; goto out; } prog = bpf_object__find_program_by_name(obj_prog, "xdp_pass"); if (!prog) { ret = -errno; goto out; } ret = bpf_object__load(obj_prog); if (ret) { ret = -errno; fprintf(stderr, "Couldn't load object: %s\n", strerror(-ret)); goto out; } prog_fd 
= bpf_program__fd(prog); if (prog_fd < 0) { ret = -errno; fprintf(stderr, "Couldn't get prog fd: %s\n", strerror(-ret)); goto out; } if (mode == XDP_MODE_SKB) opts.flags = XDP_FLAGS_SKB_MODE; link_fd = bpf_link_create(prog_fd, ifindex, BPF_XDP, &opts); if (link_fd < 0) { ret = SKIPPED_TEST; fprintf(stderr, "Couldn't attach XDP prog to ifindex %d: %s\n", ifindex, strerror(errno)); goto out; } mp = xdp_multiprog__get_from_ifindex(ifindex); ret = libxdp_get_error(mp); if (ret) { fprintf(stderr, "Couldn't get multiprog on ifindex %d: %s\n", ifindex, strerror(-ret)); goto out; } ret = xdp_multiprog__detach(mp); out: if (link_fd > 0) close(link_fd); xdp_multiprog__close(mp); bpf_object__close(obj_prog); return ret; } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int ifindex, ret; if (setrlimit(RLIMIT_MEMLOCK, &r)) { fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } char *envval; envval = secure_getenv("VERBOSE_TESTS"); silence_libbpf_logging(); if (envval && envval[0] == '1') verbose_libxdp_logging(); else silence_libxdp_logging(); if (argc != 2) usage(argv[0]); ifindex = if_nametoindex(argv[1]); if (!ifindex) { fprintf(stderr, "Interface '%s' not found.\n", argv[1]); usage(argv[0]); } ret = check_link_detach(ifindex, XDP_MODE_SKB); if (ret) { fprintf(stderr, "Failed to detach XDP prog from ifindex %d mode %s: %s\n", ifindex, "XDP_MODE_SKB", strerror(-ret)); return ret; } ret = check_link_detach(ifindex, XDP_MODE_NATIVE); if (ret) { fprintf(stderr, "Failed to detach XDP prog from ifindex %d mode %s: %s\n", ifindex, "XDP_MODE_NATIVE", strerror(-ret)); } return ret; } xdp-tools-1.6.1/lib/libxdp/tests/test_runner.sh000077500000000000000000000051011514310632100215420ustar00rootroot00000000000000#!/bin/bash # SPDX-License-Identifier: GPL-2.0-or-later # # Script to setup and manage tests for xdp-tools. # Based on the test-env script from xdp-tutorial. 
# # Author: Toke Høiland-Jørgensen (toke@redhat.com) # Date: 26 May 2020 # Copyright (c) 2020 Red Hat set -o errexit set -o nounset umask 077 TEST_PROG_DIR="${TEST_PROG_DIR:-$(dirname "${BASH_SOURCE[0]}")}" ALL_TESTS="" VERBOSE_TESTS=${V:-0} export VERBOSE_TESTS # Odd return value for skipping, as only 0-255 is valid. SKIPPED_TEST=249 is_func() { type "$1" 2>/dev/null | grep -q 'is a function' } check_run() { local ret [ "$VERBOSE_TESTS" -eq "1" ] && echo "$@" "$@" ret=$? if [ "$ret" -ne "0" ]; then exit $ret fi } exec_test() { local testn="$1" local output local ret local prefix prefix=$(printf " %-30s" "[$testn]") if ! is_func "$testn"; then echo "${prefix}INVALID" return 1 fi if [ "$VERBOSE_TESTS" -eq "1" ]; then echo "${prefix}START:" ($testn 2>&1) | sed -u 's/^/ /' ret=${PIPESTATUS[0]} echo " Test $testn exited with return code: $ret" else echo -n "$prefix" output=$($testn 2>&1) ret=$? prefix= fi if [ "$ret" -eq "0" ]; then echo "${prefix}PASS" elif [ "$ret" -eq "$SKIPPED_TEST" ]; then echo "${prefix}SKIPPED" ret=0 else echo "${prefix}FAIL" fi if [ "$ret" -ne "0" ] && [ "$VERBOSE_TESTS" -ne "1" ]; then echo "$output" | sed 's/^/ /' echo " Test $testn exited with return code: $ret" fi return $ret } run_tests() { local TESTS="$*" local ret=0 [ -z "$TESTS" ] && TESTS="$ALL_TESTS" echo " Running tests from $TEST_DEFINITIONS" for testn in $TESTS; do exec_test $testn || ret=1 if is_func cleanup_tests; then cleanup_tests || true fi done return $ret } usage() { echo "Usage: $0 [test names]" >&2 exit 1 } if [ "$EUID" -ne "0" ]; then if command -v sudo >/dev/null 2>&1; then exec sudo env CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" LDLIBS="$LDLIBS" V=${VERBOSE_TESTS} "$0" "$@" else die "Tests must be run as root" fi else if [ "${DID_UNSHARE:-0}" -ne "1" ]; then echo "Executing tests in separate net- and mount namespaces" >&2 exec env DID_UNSHARE=1 unshare -n -m "$0" "$@" fi fi TEST_DEFINITIONS="${1:-}" [ -f "$TEST_DEFINITIONS" ] || usage source "$TEST_DEFINITIONS" 
shift run_tests "$@" xdp-tools-1.6.1/lib/libxdp/tests/test_utils.c000066400000000000000000000027571514310632100212140ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include "test_utils.h" #include /* ERR_PTR */ static int try_snprintf(char *buf, size_t buf_len, const char *format, ...) { va_list args; int len; va_start(args, format); len = vsnprintf(buf, buf_len, format, args); va_end(args); if (len < 0) return -EINVAL; else if ((size_t)len >= buf_len) return -ENAMETOOLONG; return 0; } static bool try_bpf_file(char *buf, size_t buf_size, char *path, const char *progname) { struct stat sb = {}; if (try_snprintf(buf, buf_size, "%s/%s", path, progname)) return false; if (stat(buf, &sb)) return false; return true; } int find_bpf_file(char *buf, size_t buf_size, const char *progname) { static char *bpf_obj_paths[] = { #ifdef DEBUG ".", #endif BPF_OBJECT_PATH, NULL }; char *path, **p; path = secure_getenv(XDP_OBJECT_ENVVAR); if (path && try_bpf_file(buf, buf_size, path, progname)) { return 0; } else if (!path) { for (p = bpf_obj_paths; *p; p++) if (try_bpf_file(buf, buf_size, *p, progname)) return 0; } fprintf(stderr, "Couldn't find a BPF file with name %s\n", progname); return -ENOENT; } struct bpf_object *open_bpf_file(const char *progname, struct bpf_object_open_opts *opts) { char buf[PATH_MAX]; int err; err = find_bpf_file(buf, sizeof(buf), progname); if (err) return ERR_PTR(err); return bpf_object__open_file(buf, opts); } xdp-tools-1.6.1/lib/libxdp/tests/test_utils.h000066400000000000000000000023131514310632100212050ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __TEST_UTILS_H #define __TEST_UTILS_H #include #include #ifndef BPF_OBJECT_PATH #define BPF_OBJECT_PATH "/usr/lib/bpf" #endif #define __unused __attribute__((unused)) static int libbpf_silent_func(__unused enum libbpf_print_level level, __unused const char *format, __unused va_list args) { return 0; } 
static inline void silence_libbpf_logging(void) { libbpf_set_print(libbpf_silent_func); } static int libxdp_silent_func(__unused enum libxdp_print_level level, __unused const char *format, __unused va_list args) { return 0; } static int libxdp_verbose_func(__unused enum libxdp_print_level level, __unused const char *format, __unused va_list args) { fprintf(stderr, " "); vfprintf(stderr, format, args); return 0; } static inline void silence_libxdp_logging(void) { libxdp_set_print(libxdp_silent_func); } static inline void verbose_libxdp_logging(void) { libxdp_set_print(libxdp_verbose_func); } int find_bpf_file(char *buf, size_t buf_size, const char *progname); struct bpf_object *open_bpf_file(const char *progname, struct bpf_object_open_opts *opts); #endif xdp-tools-1.6.1/lib/libxdp/tests/test_xdp_devbound.c000066400000000000000000000223701514310632100225260ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include #define ARRAY_SIZE(_x) (sizeof(_x) / sizeof((_x)[0])) #define EXIT_SKIPPED 249 static bool kern_compat; static struct xdp_program *load_prog(void) { DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, .prog_name = "xdp_pass", .find_filename = "xdp-dispatcher.o", ); return xdp_program__create(&opts); } static int check_attached_progs(int ifindex, int count, bool devbound) { struct xdp_multiprog *mp; int ret; mp = xdp_multiprog__get_from_ifindex(ifindex); ret = libxdp_get_error(mp); if (ret) { fprintf(stderr, "Couldn't get multiprog on ifindex %d: %s\n", ifindex, strerror(-ret)); return ret; } ret = -EINVAL; if (xdp_multiprog__is_legacy(mp)) { fprintf(stderr, "Found legacy prog on ifindex %d\n", ifindex); goto out; } if (xdp_multiprog__program_count(mp) != count) { fprintf(stderr, "Expected %d programs loaded on ifindex %d, found %d\n", count, ifindex, xdp_multiprog__program_count(mp)); goto out; } if 
(xdp_multiprog__xdp_dev_bound(mp) != devbound) { fprintf(stderr, "Multiprog on ifindex %d %s device binding, expected %s\n", ifindex, xdp_multiprog__xdp_dev_bound(mp) ? "supports" : "does not support", devbound ? "support" : "no support"); goto out; } ret = 0; out: xdp_multiprog__close(mp); return ret; } static void print_test_result(const char *func, int ret) { fflush(stderr); fprintf(stderr, "%s:\t%s\n", func, ret ? (ret == EXIT_SKIPPED ? "SKIPPED" : "FAILED") : "PASSED"); fflush(stdout); } static int load_attach_prog(struct xdp_program **prog, int ifindex, bool devbound) { int ret; *prog = load_prog(); if (!*prog) { ret = -errno; fprintf(stderr, "Couldn't load program: %s\n", strerror(-ret)); return ret; } return xdp_program__attach(*prog, ifindex, XDP_MODE_NATIVE, devbound ? XDP_ATTACH_DEVBIND : 0); } static int _check_load(int ifindex, bool devbound, bool should_succeed) { struct xdp_program *prog = NULL; bool attached = false; int ret; if (!kern_compat && devbound) { ret = EXIT_SKIPPED; goto out; } ret = load_attach_prog(&prog, ifindex, devbound); attached = !ret; if (attached != should_succeed) { ret = -EINVAL; goto out; } if (should_succeed) ret = check_attached_progs(ifindex, 1, devbound); else ret = 0; out: if (attached) xdp_program__detach(prog, ifindex, XDP_MODE_NATIVE, 0); xdp_program__close(prog); return ret; } static int check_load_devbound(int ifindex) { int ret = _check_load(ifindex, true, true); print_test_result(__func__, ret); return ret; } static int check_load_nodevbound_success(int ifindex) { int ret = _check_load(ifindex, false, true); print_test_result(__func__, ret); return ret; } static int check_load_devbound_multi(int ifindex) { struct xdp_program *prog1 = NULL, *prog2 = NULL; int ret; if (!kern_compat) { ret = EXIT_SKIPPED; goto out; } ret = load_attach_prog(&prog1, ifindex, true); if (ret) goto out; ret = load_attach_prog(&prog2, ifindex, true); if (ret) goto out_prog1; ret = check_attached_progs(ifindex, 2, true); 
xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); out_prog1: xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog2); xdp_program__close(prog1); print_test_result(__func__, ret); return ret; } static int _check_load_mix(int ifindex, bool devbound1, bool devbound2) { struct xdp_program *prog1 = NULL, *prog2 = NULL; int ret; if (!kern_compat && (devbound1 || devbound2)) { ret = EXIT_SKIPPED; goto out; } ret = load_attach_prog(&prog1, ifindex, devbound1); if (ret) goto out; /* First program attached, dispatcher supports device binding */ ret = check_attached_progs(ifindex, 1, devbound1); if (ret) goto out; ret = load_attach_prog(&prog2, ifindex, devbound2); if (!ret) { xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); ret = -EINVAL; goto out_prog1; } /* Still only a single program loaded, with device binding */ ret = check_attached_progs(ifindex, 1, devbound1); out_prog1: xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog2); xdp_program__close(prog1); return ret; } static int check_load_mix_devbound_nodevbound(int ifindex) { int ret = _check_load_mix(ifindex, true, false); print_test_result(__func__, ret); return ret; } static int check_load_mix_nodevbound_devbound(int ifindex) { int ret = _check_load_mix(ifindex, false, true); print_test_result(__func__, ret); return ret; } static int check_load_devbound_multiple_ifindex(int ifindex1, int ifindex2) { struct xdp_program *prog = NULL; int ret; if (!kern_compat) { ret = EXIT_SKIPPED; goto out; } prog = load_prog(); ret = xdp_program__attach(prog, ifindex1, XDP_MODE_NATIVE, XDP_ATTACH_DEVBIND); if (ret) { ret = -EINVAL; goto out; } /* Still only a single program loaded, with device binding */ ret = check_attached_progs(ifindex1, 1, true); if (ret) goto out; ret = xdp_program__attach(prog, ifindex2, XDP_MODE_NATIVE, XDP_ATTACH_DEVBIND); if (!ret) { xdp_program__detach(prog, ifindex2, XDP_MODE_NATIVE, 0); ret = -EINVAL; goto out; } out: 
xdp_program__detach(prog, ifindex1, XDP_MODE_NATIVE, 0); xdp_program__close(prog); print_test_result(__func__, ret == EXIT_SKIPPED ? ret : !ret); return !ret; } static int check_load_mixed_multiple_ifindex(int ifindex1, int ifindex2) { struct xdp_program *prog = NULL; int ret; if (!kern_compat) { ret = EXIT_SKIPPED; goto out; } prog = load_prog(); ret = xdp_program__attach(prog, ifindex1, XDP_MODE_NATIVE, XDP_ATTACH_DEVBIND); if (ret) goto out; /* Still only a single program loaded, with device binding */ ret = check_attached_progs(ifindex1, 1, true); if (ret) goto out_prog1; ret = xdp_program__attach(prog, ifindex2, XDP_MODE_NATIVE, 0); if (!ret) { xdp_program__detach(prog, ifindex2, XDP_MODE_NATIVE, 0); ret = -EINVAL; } out_prog1: xdp_program__detach(prog, ifindex1, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog); print_test_result(__func__, ret == EXIT_SKIPPED ? ret : !ret); return !ret; } static int check_load2_mixed_multiple_ifindex(int ifindex1, int ifindex2) { struct xdp_program *prog1 = NULL, *prog2 = NULL; int ret; if (!kern_compat) { ret = EXIT_SKIPPED; goto out; } ret = load_attach_prog(&prog1, ifindex1, true); if (ret) goto out; /* First program attached, dispatcher supports device binding */ ret = check_attached_progs(ifindex1, 1, true); if (ret) goto out_prog1; ret = load_attach_prog(&prog2, ifindex2, false); if (ret) goto out_prog1; /* Still only a single program loaded, with device binding */ ret = check_attached_progs(ifindex2, 1, false); out_prog1: xdp_program__detach(prog1, ifindex1, XDP_MODE_NATIVE, 0); out: xdp_program__detach(prog2, ifindex2, XDP_MODE_NATIVE, 0); xdp_program__close(prog2); xdp_program__close(prog1); print_test_result(__func__, ret); return ret; } static bool check_devbound_compat(void) { #ifdef HAVE_LIBBPF_BPF_PROGRAM__FLAGS struct xdp_program *test_prog; struct bpf_program *prog; struct bpf_object *obj; bool ret = false; int err; test_prog = load_prog(); if (!test_prog) return false; obj = xdp_program__bpf_obj(test_prog); 
if (!obj) goto out; prog = bpf_object__find_program_by_name(obj, "xdp_pass"); if (!prog) goto out; bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY); err = bpf_object__load(obj); if (!err) { printf("Kernel supports XDP programs with device binding\n"); ret = true; } else { printf("Kernel DOES NOT support XDP programs with device binding\n"); } fflush(stdout); out: xdp_program__close(test_prog); return ret; #else return -EOPNOTSUPP; #endif } static void usage(char *progname) { fprintf(stderr, "Usage: %s \n", progname); exit(EXIT_FAILURE); } int main(int argc, char **argv) { struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY }; int ifindex1, ifindex2, ret = 0; if (setrlimit(RLIMIT_MEMLOCK, &r)) { fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } char *envval; envval = secure_getenv("VERBOSE_TESTS"); silence_libbpf_logging(); if (envval && envval[0] == '1') verbose_libxdp_logging(); else silence_libxdp_logging(); if (argc != 3) usage(argv[0]); ifindex1 = if_nametoindex(argv[1]); if (!ifindex1) { fprintf(stderr, "Interface '%s' not found.\n", argv[1]); usage(argv[0]); } ifindex2 = if_nametoindex(argv[2]); if (!ifindex2) { fprintf(stderr, "Interface '%s' not found.\n", argv[1]); usage(argv[0]); } kern_compat = check_devbound_compat(); ret = check_load_devbound(ifindex1); ret |= check_load_nodevbound_success(ifindex1); ret |= check_load_devbound_multi(ifindex1); ret |= check_load_mix_devbound_nodevbound(ifindex1); ret |= check_load_mix_nodevbound_devbound(ifindex1); ret |= check_load_devbound_multiple_ifindex(ifindex1, ifindex2); ret |= check_load_mixed_multiple_ifindex(ifindex1, ifindex2); ret |= check_load2_mixed_multiple_ifindex(ifindex1, ifindex2); return ret == EXIT_SKIPPED ? 
0 : ret; } xdp-tools-1.6.1/lib/libxdp/tests/test_xdp_frags.c000066400000000000000000000207571514310632100220310ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include # define ARRAY_SIZE(_x) (sizeof(_x) / sizeof((_x)[0])) static bool kern_compat; static struct xdp_program *load_prog(void) { DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, .prog_name = "xdp_pass", .find_filename = "xdp-dispatcher.o", ); return xdp_program__create(&opts); } static int check_attached_progs(int ifindex, int count, bool frags) { struct xdp_multiprog *mp; int ret; /* If the kernel does not support frags, we always expect * frags support to be disabled on a returned dispatcher */ if (!kern_compat) frags = false; mp = xdp_multiprog__get_from_ifindex(ifindex); ret = libxdp_get_error(mp); if (ret) { fprintf(stderr, "Couldn't get multiprog on ifindex %d: %s\n", ifindex, strerror(-ret)); return ret; } ret = -EINVAL; if (xdp_multiprog__is_legacy(mp)) { fprintf(stderr, "Found legacy prog on ifindex %d\n", ifindex); goto out; } if (xdp_multiprog__program_count(mp) != count) { fprintf(stderr, "Expected %d programs loaded on ifindex %d, found %d\n", count, ifindex, xdp_multiprog__program_count(mp)); goto out; } if (xdp_multiprog__xdp_frags_support(mp) != frags) { fprintf(stderr, "Multiprog on ifindex %d %s frags, expected %s\n", ifindex, xdp_multiprog__xdp_frags_support(mp) ? "supports" : "does not support", frags ? "support" : "no support"); goto out; } ret = 0; out: xdp_multiprog__close(mp); return ret; } static void print_test_result(const char *func, int ret) { fflush(stderr); fprintf(stderr, "%s:\t%s\n", func, ret ? 
"FAILED" : "PASSED"); fflush(stdout); } static int load_attach_prog(struct xdp_program **prog, int ifindex, bool frags) { int ret; *prog = load_prog(); if (!*prog) { ret = -errno; fprintf(stderr, "Couldn't load program: %s\n", strerror(-ret)); return ret; } ret = xdp_program__set_xdp_frags_support(*prog, frags); if (ret) return ret; return xdp_program__attach(*prog, ifindex, XDP_MODE_NATIVE, 0); } static int _check_load(int ifindex, bool frags, bool should_succeed) { struct xdp_program *prog = NULL; bool attached; int ret; ret = load_attach_prog(&prog, ifindex, frags); attached = !ret; if (attached != should_succeed) { ret = -EINVAL; goto out; } if (should_succeed) ret = check_attached_progs(ifindex, 1, frags); else ret = 0; out: if (attached) xdp_program__detach(prog, ifindex, XDP_MODE_NATIVE, 0); xdp_program__close(prog); return ret; } static int check_load_frags(int ifindex_bigmtu, int ifindex_smallmtu) { int ret = _check_load(ifindex_smallmtu, true, true); if (!ret && ifindex_bigmtu) _check_load(ifindex_bigmtu, true, true); print_test_result(__func__, ret); return ret; } static int check_load_nofrags_success(int ifindex) { int ret = _check_load(ifindex, false, true); print_test_result(__func__, ret); return ret; } static int check_load_nofrags_fail(int ifindex) { int ret = _check_load(ifindex, false, false); print_test_result(__func__, ret); return ret; } static int check_load_frags_multi(int ifindex) { struct xdp_program *prog1 = NULL, *prog2 = NULL; int ret; ret = load_attach_prog(&prog1, ifindex, true); if (ret) goto out; ret = load_attach_prog(&prog2, ifindex, true); if (ret) goto out_prog1; ret = check_attached_progs(ifindex, 2, true); xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); out_prog1: xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog2); xdp_program__close(prog1); print_test_result(__func__, ret); return ret; } static int check_load_mix_small(int ifindex) { struct xdp_program *prog1 = NULL, *prog2 = 
NULL; int ret; ret = load_attach_prog(&prog1, ifindex, true); if (ret) goto out; /* First program attached, dispatcher supports frags */ ret = check_attached_progs(ifindex, 1, true); if (ret) goto out; ret = load_attach_prog(&prog2, ifindex, false); if (ret) goto out_prog1; /* Mixed program attachment, dispatcher should not support frags */ ret = check_attached_progs(ifindex, 2, false); ret = xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0) || ret; if (ret) goto out_prog1; /* Second program removed, back to frags-only */ ret = check_attached_progs(ifindex, 1, true) || ret; out_prog1: xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog2); xdp_program__close(prog1); print_test_result(__func__, ret); return ret; } static int check_load_mix_big(int ifindex) { struct xdp_program *prog1 = NULL, *prog2 = NULL; int ret; ret = load_attach_prog(&prog1, ifindex, true); if (ret) goto out; /* First program attached, dispatcher supports frags */ ret = check_attached_progs(ifindex, 1, true); if (ret) goto out; /* Second non-frags program should fail on big-MTU device */ ret = load_attach_prog(&prog2, ifindex, false); if (!ret) { xdp_program__detach(prog2, ifindex, XDP_MODE_NATIVE, 0); ret = -EINVAL; goto out_prog1; } /* Still only a single program loaded, with frags support */ ret = check_attached_progs(ifindex, 1, true); out_prog1: xdp_program__detach(prog1, ifindex, XDP_MODE_NATIVE, 0); out: xdp_program__close(prog2); xdp_program__close(prog1); print_test_result(__func__, ret); return ret; } static bool check_frags_compat(void) { #ifdef HAVE_LIBBPF_BPF_PROGRAM__FLAGS struct xdp_program *test_prog; struct bpf_program *prog; struct bpf_object *obj; bool ret = false; int err; test_prog = load_prog(); if (!test_prog) return false; obj = xdp_program__bpf_obj(test_prog); if (!obj) goto out; prog = bpf_object__find_program_by_name(obj, "xdp_pass"); if (!prog) goto out; bpf_program__set_flags(prog, BPF_F_XDP_HAS_FRAGS); err = 
bpf_object__load(obj); if (!err) { printf("Kernel supports XDP programs with frags\n"); ret = true; } else { printf("Kernel DOES NOT support XDP programs with frags\n"); } fflush(stdout); out: xdp_program__close(test_prog); return ret; #else return -EOPNOTSUPP; #endif } static void usage(char *progname) { fprintf(stderr, "Usage: %s \n", progname); exit(EXIT_FAILURE); } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int ifindex_bigmtu, ifindex_smallmtu, ret; if (setrlimit(RLIMIT_MEMLOCK, &r)) { fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } char *envval; envval = secure_getenv("VERBOSE_TESTS"); silence_libbpf_logging(); if (envval && envval[0] == '1') verbose_libxdp_logging(); else silence_libxdp_logging(); if (argc != 3) usage(argv[0]); ifindex_bigmtu = if_nametoindex(argv[1]); ifindex_smallmtu = if_nametoindex(argv[2]); if (!ifindex_bigmtu || !ifindex_smallmtu) { fprintf(stderr, "Interface '%s' or '%s' not found.\n", argv[1], argv[2]); usage(argv[0]); } kern_compat = check_frags_compat(); ret = check_load_frags(kern_compat ? 
ifindex_bigmtu : 0, ifindex_smallmtu); ret = check_load_nofrags_success(ifindex_smallmtu) || ret; if (kern_compat) { ret = check_load_nofrags_fail(ifindex_bigmtu) || ret; ret = check_load_frags_multi(ifindex_bigmtu) || ret; ret = check_load_mix_big(ifindex_bigmtu) || ret; } ret = check_load_mix_small(ifindex_smallmtu) || ret; return ret; } xdp-tools-1.6.1/lib/libxdp/tests/test_xsk_non_privileged.c000066400000000000000000000116311514310632100237340ustar00rootroot00000000000000// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include #define NUM_DESCS ((XSK_RING_PROD__DEFAULT_NUM_DESCS \ + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2) #define UMEM_SIZE (NUM_DESCS * XSK_UMEM__DEFAULT_FRAME_SIZE) static void run_privileged_operations(int ifindex, int queue_id, int *sock_fd) { int xsks_map_fd = -1; if (xsk_setup_xdp_prog(ifindex, &xsks_map_fd) || xsks_map_fd < 0) { perror("xsk_setup_xdp_prog failed"); exit(EXIT_FAILURE); } *sock_fd = socket(AF_XDP, SOCK_RAW, 0); if (*sock_fd < 0) { perror("socket(AF_XDP, ...) failed"); exit(EXIT_FAILURE); } /* This call requires extra capabilities in older kernels, so keeping * it in a privileged section. And it's not supported on even older * kernels, so not failing if that's the case. 
*/ if (bpf_map_update_elem(xsks_map_fd, &queue_id, sock_fd, 0) && errno != EOPNOTSUPP) { perror("bpf_map_update_elem failed"); exit(EXIT_FAILURE); } close(xsks_map_fd); } static void update_rlimit_memlock(void) { struct rlimit rlim = { .rlim_cur = UMEM_SIZE, .rlim_max = UMEM_SIZE }; if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { perror("setrlimit(RLIMIT_MEMLOCK) failed"); exit(EXIT_FAILURE); } } static void drop_capabilities(void) { if (capng_get_caps_process()) { perror("capng_get_caps_process failed"); exit(EXIT_FAILURE); } capng_clear(CAPNG_SELECT_BOTH); if (capng_apply(CAPNG_SELECT_BOTH)) { perror("capng_apply failed"); exit(EXIT_FAILURE); } } static void run_non_privileged_preconfig(const char *ifname, const char *ifname2, int sock_fd) { /* This call requires CAP_NET_RAW on kernels older than 5.7, * so not checking the result. It may fail or not, we do not * rely on that much. */ setsockopt(sock_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname)); /* The second update should always fail because it always * requires CAP_NET_RAW. */ if (!setsockopt(sock_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname2, strlen(ifname2))) { perror("setsockopt(SO_BINDTODEVICE, ifname2) succeeded"); exit(EXIT_FAILURE); } } static struct xsk_umem *create_umem_non_privileged(int sock_fd) { struct xsk_umem *umem = NULL; struct xsk_ring_cons cq; struct xsk_ring_prod fq; void *b; if (posix_memalign(&b, getpagesize(), UMEM_SIZE)) { perror("posix_memalign failed"); exit(EXIT_FAILURE); } /* This variant requires CAP_NET_RAW, so should fail. */ DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts_cap, .size = UMEM_SIZE, .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, ); umem = xsk_umem__create_opts(b, &fq, &cq, &opts_cap); if (umem) { perror("xsk_umem__create_opts succeeded"); exit(EXIT_FAILURE); } /* This variant shouldn't need any capabilities, so should pass. 
*/ DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts, .fd = sock_fd, .size = UMEM_SIZE, .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, ); umem = xsk_umem__create_opts(b, &fq, &cq, &opts); if (!umem) { perror("xsk_umem__create_opts failed"); exit(EXIT_FAILURE); } return umem; } static struct xsk_socket *create_xsk_non_privileged(const char *ifname, struct xsk_umem *umem, int queue_id) { struct xsk_socket *xsk = NULL; struct xsk_ring_cons rx; struct xsk_ring_prod tx; DECLARE_LIBXDP_OPTS(xsk_socket_opts, opts, .rx = &rx, .tx = &tx, .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD, .bind_flags = XDP_USE_NEED_WAKEUP, .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST, ); xsk = xsk_socket__create_opts(ifname, queue_id, umem, &opts); if (!xsk) { perror("xsk_socket__create_opts failed"); exit(EXIT_FAILURE); } return xsk; } int main(int argc, const char *argv[]) { const char *ifname, *ifname2; struct xsk_socket *xsk; struct xsk_umem *umem; int ifindex, queue_id; int sock_fd; silence_libbpf_logging(); if (argc < 3) { printf("Usage: %s \n", argv[0]); exit(EXIT_FAILURE); } update_rlimit_memlock(); ifname = argv[1]; ifname2 = argv[2]; queue_id = 0; ifindex = if_nametoindex(ifname); if (!ifindex) { perror("if_nametoindex(ifname) failed"); exit(EXIT_FAILURE); } if (!if_nametoindex(ifname2)) { perror("if_nametoindex(ifname2) failed"); exit(EXIT_FAILURE); } run_privileged_operations(ifindex, queue_id, &sock_fd); drop_capabilities(); run_non_privileged_preconfig(ifname, ifname2, sock_fd); umem = create_umem_non_privileged(sock_fd); xsk = create_xsk_non_privileged(ifname, umem, queue_id); xsk_socket__delete(xsk); return EXIT_SUCCESS; } xdp-tools-1.6.1/lib/libxdp/tests/test_xsk_refcnt.c000066400000000000000000000157441514310632100222220ustar00rootroot00000000000000// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 
#include #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; #define MAX_EVENTS 10 #define MAX_NUM_QUEUES 4 #define TEST_NAME_LENGTH 128 struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; struct xsk_umem *umem; void *buffer; }; struct xsk_socket_info { struct xsk_ring_cons rx; struct xsk_umem_info *umem; struct xsk_socket *xsk; }; /* Event holds socket operations that are run concurrently * and in theory can produce a race condition */ struct xsk_test_event { u32 num_create; u32 num_delete; u32 create_qids[MAX_NUM_QUEUES]; /* QIDs for sockets being created in this event */ u32 delete_qids[MAX_NUM_QUEUES]; /* QIDs for sockets being deleted in this event */ }; struct xsk_test { char name[TEST_NAME_LENGTH]; u32 num_events; struct xsk_test_event events[MAX_EVENTS]; }; /* Tests that use less queues must come first, * so we can run all possible tests on VMs with * small number of CPUs */ static struct xsk_test all_tests[] = { { "Single socket created and deleted", .num_events = 2, .events = {{ .num_create = 1, .create_qids = {0} }, { .num_delete = 1, .delete_qids = {0} } }}, { "2 sockets, created and deleted sequentially", .num_events = 4, .events = {{ .num_create = 1, .create_qids = {0} }, { .num_create = 1, .create_qids = {1} }, { .num_delete = 1, .delete_qids = {0} }, { .num_delete = 1, .delete_qids = {1} } }}, { "2 sockets, created sequentially and deleted asynchronously", .num_events = 3, .events = {{ .num_create = 1, .create_qids = {0} }, { .num_create = 1, .create_qids = {1} }, { .num_delete = 2, .delete_qids = {0, 1} } }}, { "2 sockets, asynchronously delete and create", .num_events = 3, .events = {{ .num_create = 1, .create_qids = {0} }, { .num_create = 1, .create_qids = {1}, .num_delete = 1, .delete_qids = {0} }, { .num_delete = 1, .delete_qids = {1} } }}, { "3 sockets, created and deleted sequentially", 
.num_events = 6, .events = {{ .num_create = 1, .create_qids = {0} }, { .num_create = 1, .create_qids = {1} }, { .num_create = 1, .create_qids = {2} }, { .num_delete = 1, .delete_qids = {1} }, { .num_delete = 1, .delete_qids = {2} }, { .num_delete = 1, .delete_qids = {0} } }}, }; # define ARRAY_SIZE(_x) (sizeof(_x) / sizeof((_x)[0])) static const char *opt_if; static const u8 num_tests = ARRAY_SIZE(all_tests); static struct xsk_socket_info *xsks[MAX_NUM_QUEUES]; #define FRAME_SIZE 64 #define NUM_FRAMES (XSK_RING_CONS__DEFAULT_NUM_DESCS * 2) static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { struct xsk_umem_info *umem; umem = calloc(1, sizeof(*umem)); if (!umem) exit(EXIT_FAILURE); DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts, .size = size, ); umem->umem = xsk_umem__create_opts(buffer, &umem->fq, &umem->cq, &opts); if (!umem->umem) exit(errno); umem->buffer = buffer; return umem; } static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, unsigned int qid) { struct xsk_socket_info *xsk; struct xsk_ring_cons *rxr; xsk = calloc(1, sizeof(*xsk)); if (!xsk) exit(EXIT_FAILURE); xsk->umem = umem; rxr = &xsk->rx; DECLARE_LIBXDP_OPTS(xsk_socket_opts, opts, .rx = rxr, .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, ); xsk->xsk = xsk_socket__create_opts(opt_if, qid, umem->umem, &opts); return xsk; } static void *create_socket(void *args) { struct xsk_umem_info *umem; u32 qid = *(u32 *)args; void *buffs; if (posix_memalign(&buffs, getpagesize(), /* PAGE_SIZE aligned */ NUM_FRAMES * FRAME_SIZE)) { fprintf(stderr, "ERROR: Can't allocate buffer memory \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } umem = xsk_configure_umem(buffs, NUM_FRAMES * FRAME_SIZE); xsks[qid] = xsk_configure_socket(umem, qid); return NULL; } static void *delete_socket(void *args) { u32 qid = *(u32 *)args; struct xsk_umem *umem; void *buff; buff = xsks[qid]->umem->buffer; umem = xsks[qid]->umem->umem; xsk_socket__delete(xsks[qid]->xsk); free(buff); 
(void)xsk_umem__delete(umem); return NULL; } static bool xsk_prog_attached(void) { char xsk_prog_name[] = "xsk_def_prog"; int ifindex = if_nametoindex(opt_if); struct xdp_program *xsk_prog; struct xdp_multiprog *mp; bool answer = false; mp = xdp_multiprog__get_from_ifindex(ifindex); if (IS_ERR_OR_NULL(mp)) return false; xsk_prog = xdp_multiprog__is_legacy(mp) ? xdp_multiprog__main_prog(mp) : xdp_multiprog__next_prog(NULL, mp); if (IS_ERR_OR_NULL(xsk_prog)) goto free_mp; answer = !strncmp(xsk_prog_name, xdp_program__name(xsk_prog), sizeof(xsk_prog_name)); free_mp: xdp_multiprog__close(mp); return answer; } static void update_reference_refcnt(struct xsk_test_event *event, int *refcnt) { *refcnt += event->num_create; *refcnt -= event->num_delete; } static bool check_run_event(struct xsk_test_event *event, int *refcnt) { pthread_t threads[MAX_NUM_QUEUES]; bool prog_attached, prog_needed; u32 thread_num = 0, i; int ret; update_reference_refcnt(event, refcnt); for (i = 0; i < event->num_create; i++) { ret = pthread_create(&threads[thread_num++], NULL, &create_socket, &event->create_qids[i]); if (ret) exit(ret); } for (i = 0; i < event->num_delete; i++) { ret = pthread_create(&threads[thread_num++], NULL, &delete_socket, &event->delete_qids[i]); if (ret) exit(ret); } for (i = 0; i < thread_num; i++) pthread_join(threads[i], NULL); prog_attached = xsk_prog_attached(); prog_needed = *refcnt > 0; if (prog_needed != prog_attached) { printf("Program is referenced by %d sockets, but is %s attached\n", *refcnt, prog_attached ? "still" : "not"); return false; } return true; } static bool check_run_test(struct xsk_test *test) { bool test_ok = false; int refcnt = 0; u32 i = 0; for (i = 0; i < test->num_events; i++) { if (!check_run_event(&test->events[i], &refcnt)) { printf("Event %u failed\n", i); goto print_result; } } /* Do not let tests interfere with each other */ sleep(1); test_ok = true; print_result: printf("%s: %s\n", test->name, test_ok ? 
"PASSED" : "FAILED"); return test_ok; } static int read_args(int argc, char **argv) { if (argc != 2) return -1; opt_if = argv[1]; return 0; } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; u8 i = 0; if (read_args(argc, argv)) return -1; if (setrlimit(RLIMIT_MEMLOCK, &r)) { fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", strerror(errno)); exit(EXIT_FAILURE); } silence_libbpf_logging(); for (i = 0; i < num_tests; i++) { if (!check_run_test(&all_tests[i])) exit(EXIT_FAILURE); } return 0; } xdp-tools-1.6.1/lib/libxdp/tests/test_xsk_umem_flags.c000066400000000000000000000065021514310632100230500ustar00rootroot00000000000000// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) #include #include #include #include #include #include #include #include #include "test_utils.h" #include #include #define NUM_DESCS ((XSK_RING_PROD__DEFAULT_NUM_DESCS \ + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2) #define UMEM_SIZE (NUM_DESCS * XSK_UMEM__DEFAULT_FRAME_SIZE) static void update_rlimit_memlock(void) { struct rlimit rlim = { .rlim_cur = UMEM_SIZE, .rlim_max = UMEM_SIZE }; if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { perror("setrlimit(RLIMIT_MEMLOCK) failed"); exit(EXIT_FAILURE); } } static struct xsk_umem *create_umem_with_flags() { struct xsk_umem *umem = NULL; struct xsk_ring_cons cq; struct xsk_ring_prod fq; void *b; if (posix_memalign(&b, getpagesize(), UMEM_SIZE)) { perror("posix_memalign failed"); exit(EXIT_FAILURE); } /* This variant uses a frame_size that is not a power of 2 without * flags, should fail. 
*/ DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts_no_flags, .size = UMEM_SIZE - 1, .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE - 1, ); umem = xsk_umem__create_opts(b, &fq, &cq, &opts_no_flags); if (umem) { perror("xsk_umem__create_opts with odd frame_size " "unexpectedly succeeded"); exit(EXIT_FAILURE); } /* This variant uses a frame_size that is not a power of 2 with flags, * should succeed. * * A failure here may indicate a mismatch in struct xdp_umem_reg * between user space and kernel space, and that fall back processing * is happening in the kernel. (Ref: LP: #2098005 and PR #477). */ DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts, .size = UMEM_SIZE - 1, .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE - 1, .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, ); umem = xsk_umem__create_opts(b, &fq, &cq, &opts); if (!umem) { perror("xsk_umem__create_opts failed"); exit(EXIT_FAILURE); } return umem; } static struct xsk_socket *create_xsk(const char *ifname, struct xsk_umem *umem, int queue_id) { struct xsk_socket *xsk = NULL; struct xsk_ring_cons rx; struct xsk_ring_prod tx; DECLARE_LIBXDP_OPTS(xsk_socket_opts, opts, .rx = &rx, .tx = &tx, .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD, .bind_flags = XDP_USE_NEED_WAKEUP, .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST, ); xsk = xsk_socket__create_opts(ifname, queue_id, umem, &opts); if (!xsk) { perror("xsk_socket__create_opts failed"); exit(EXIT_FAILURE); } return xsk; } int main(int argc, const char *argv[]) { struct xsk_socket *xsk; struct xsk_umem *umem; int ifindex, queue_id; const char *ifname; silence_libbpf_logging(); if (argc < 2) { printf("Usage: %s \n", argv[0]); exit(EXIT_FAILURE); } update_rlimit_memlock(); ifname = argv[1]; queue_id = 0; ifindex = 
if_nametoindex(ifname); if (!ifindex) { perror("if_nametoindex(ifname) failed"); exit(EXIT_FAILURE); } umem = create_umem_with_flags(); xsk = create_xsk(ifname, umem, queue_id); xsk_socket__delete(xsk); return EXIT_SUCCESS; } xdp-tools-1.6.1/lib/libxdp/tests/xdp_dispatcher.h000066400000000000000000000016451514310632100220160ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __XDP_DISPATCHER_H #define __XDP_DISPATCHER_H #ifndef MAX_DISPATCHER_ACTIONS #define MAX_DISPATCHER_ACTIONS 10 #endif struct xdp_dispatcher_config_v1 { __u8 num_progs_enabled; __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; }; #define XDP_DISPATCHER_VERSION_V1 1 struct xdp_dispatcher_config_v2 { __u8 magic; /* Set to XDP_DISPATCHER_MAGIC */ __u8 dispatcher_version; /* Set to XDP_DISPATCHER_VERSION */ __u8 num_progs_enabled; /* Number of active program slots */ __u8 is_xdp_frags; /* Whether this dispatcher is loaded with XDP frags support */ __u32 chain_call_actions[MAX_DISPATCHER_ACTIONS]; __u32 run_prios[MAX_DISPATCHER_ACTIONS]; __u32 program_flags[MAX_DISPATCHER_ACTIONS]; }; #define XDP_DISPATCHER_MAGIC 236 #define XDP_DISPATCHER_VERSION_V2 2 #endif xdp-tools-1.6.1/lib/libxdp/tests/xdp_dispatcher_v1.c000066400000000000000000000017051514310632100224140ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include "xdp_dispatcher.h" #define XDP_METADATA_SECTION "xdp_metadata" #define XDP_DISPATCHER_RETVAL 31 static volatile const struct xdp_dispatcher_config_v1 conf = {}; __attribute__ ((__noinline__)) int prog0(struct xdp_md *ctx) { volatile int ret = XDP_DISPATCHER_RETVAL; if (!ctx) return XDP_ABORTED; return ret; } __attribute__ ((__noinline__)) SEC("xdp") int xdp_dispatcher(struct xdp_md *ctx) { __u8 num_progs_enabled = conf.num_progs_enabled; int ret; if (num_progs_enabled < 1) goto out; ret = prog0(ctx); if (!((1U << ret) & conf.chain_call_actions[0])) return ret; out: return 
XDP_PASS; } char _license[] SEC("license") = "GPL"; __uint(dispatcher_version, XDP_DISPATCHER_VERSION_V1) SEC(XDP_METADATA_SECTION); xdp-tools-1.6.1/lib/libxdp/tests/xdp_dispatcher_v2.c000066400000000000000000000016751514310632100224230ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include "xdp_dispatcher.h" #define XDP_METADATA_SECTION "xdp_metadata" #define XDP_DISPATCHER_RETVAL 31 static volatile const struct xdp_dispatcher_config_v2 conf = {}; __attribute__ ((noinline)) int prog0(struct xdp_md *ctx) { volatile int ret = XDP_DISPATCHER_RETVAL; if (!ctx) return XDP_ABORTED; return ret; } __attribute__ ((noinline)) SEC("xdp") int xdp_dispatcher(struct xdp_md *ctx) { __u8 num_progs_enabled = conf.num_progs_enabled; int ret; if (num_progs_enabled < 1) goto out; ret = prog0(ctx); if (!((1U << ret) & conf.chain_call_actions[0])) return ret; out: return XDP_PASS; } char _license[] SEC("license") = "GPL"; __uint(dispatcher_version, XDP_DISPATCHER_VERSION_V2) SEC(XDP_METADATA_SECTION); xdp-tools-1.6.1/lib/libxdp/tests/xdp_pass.c000066400000000000000000000003161514310632100206230ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include SEC("xdp") int xdp_pass(struct xdp_md *ctx) { return XDP_PASS; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/lib/libxdp/xdp-dispatcher.c.in000066400000000000000000000051601514310632100211660ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ divert(-1) #forloop definition taken from example in the M4 manual define(`forloop', `pushdef(`$1', `$2')_forloop($@)popdef(`$1')') define(`_forloop',`$4`'ifelse($1, decr(`$3'), `', `define(`$1', incr($1))$0($@)')') define(`NUM_PROGS',ifdef(`MAX_DISPATCHER_ACTIONS', MAX_DISPATCHER_ACTIONS, `10')) divert(0)dnl #include #include #include #include /* While 'const volatile' sounds a little like an oxymoron, there's reason * behind the madness: * * - const places the data in rodata, where libbpf will mark 
it as read-only and * frozen on program load, letting the kernel do dead code elimination based * on the values. * * - volatile prevents the compiler from optimising away the checks based on the * compile-time value of the variables, which is important since we will be * changing the values before loading the program into the kernel. */ static volatile const struct xdp_dispatcher_config conf = {}; /* The volatile return value prevents the compiler from assuming it knows the * return value and optimising based on that. * * The function includes a no-op xdp_adjust_tail() call before returning, to * make sure the verifier doesn't disallow freplace with programs that * invalidate the packet data pointer. */ forloop(`i', `0', NUM_PROGS, `__attribute__ ((__noinline__)) int format(`prog%d', i)(struct xdp_md *ctx) { volatile int ret = XDP_DISPATCHER_RETVAL; if (!ctx) return XDP_ABORTED; bpf_xdp_adjust_tail(ctx, 0); return ret; } ') __attribute__ ((__noinline__)) int compat_test(struct xdp_md *ctx) { volatile int ret = XDP_DISPATCHER_RETVAL; if (!ctx) return XDP_ABORTED; return ret; } SEC("xdp") int xdp_dispatcher(struct xdp_md *ctx) { __u8 num_progs_enabled = conf.num_progs_enabled; int ret; forloop(`i', `0', NUM_PROGS, ` if (num_progs_enabled < incr(i)) goto out; ret = format(`prog%d', i)(ctx); if (!((1U << ret) & conf.chain_call_actions[i])) return ret; ') /* keep a reference to the compat_test() function so we can use it * as an freplace target in xdp_multiprog__check_compat() in libxdp */ if (num_progs_enabled < incr(NUM_PROGS)) goto out; ret = compat_test(ctx); out: return XDP_PASS; } SEC("xdp") int xdp_pass(struct xdp_md *ctx) { return XDP_PASS; } char _license[] SEC("license") = "GPL"; __uint(dispatcher_version, XDP_DISPATCHER_VERSION) SEC(XDP_METADATA_SECTION); xdp-tools-1.6.1/lib/libxdp/xsk.c000066400000000000000000001020071514310632100164450ustar00rootroot00000000000000// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* * AF_XDP user-space access library. 
* * Copyright(c) 2018 - 2021 Intel Corporation. * * Author(s): Magnus Karlsson */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libxdp_internal.h" #include "xsk_def_xdp_prog.h" #include "bpf_instr.h" #ifndef SOL_XDP #define SOL_XDP 283 #endif #ifndef AF_XDP #define AF_XDP 44 #endif #ifndef PF_XDP #define PF_XDP AF_XDP #endif #ifndef SO_NETNS_COOKIE #define SO_NETNS_COOKIE 71 #endif #define INIT_NS 1 struct xsk_umem { struct xsk_ring_prod *fill_save; struct xsk_ring_cons *comp_save; char *umem_area; struct xsk_umem_config config; int fd; int refcount; struct list_head ctx_list; bool rx_ring_setup_done; bool tx_ring_setup_done; }; struct xsk_ctx { struct xsk_ring_prod *fill; struct xsk_ring_cons *comp; struct xsk_umem *umem; __u32 queue_id; int refcount; int ifindex; __u64 netns_cookie; int xsks_map_fd; struct list_head list; struct xdp_program *xdp_prog; int refcnt_map_fd; char ifname[IFNAMSIZ]; }; struct xsk_socket { struct xsk_ring_cons *rx; struct xsk_ring_prod *tx; struct xsk_ctx *ctx; struct xsk_socket_config config; int fd; }; struct xsk_nl_info { int ifindex; int fd; bool xdp_prog_attached; }; /* Up until and including Linux 5.3 */ struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; __u64 desc; }; /* Up until and including Linux 5.3 */ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 rx; struct xdp_ring_offset_v1 tx; struct xdp_ring_offset_v1 fr; struct xdp_ring_offset_v1 cr; }; /* Export all inline helpers as symbols for use by language bindings. 
*/ extern inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, __u32 idx); extern inline const __u64 * xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx); extern inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, __u32 idx); extern inline const struct xdp_desc * xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx); extern inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r); extern inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb); extern inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb); extern inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx); extern inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb); extern inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx); extern inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb); extern inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb); extern inline void *xsk_umem__get_data(void *umem_area, __u64 addr); extern inline __u64 xsk_umem__extract_addr(__u64 addr); extern inline __u64 xsk_umem__extract_offset(__u64 addr); extern inline __u64 xsk_umem__add_offset_to_addr(__u64 addr); int xsk_umem__fd(const struct xsk_umem *umem) { return umem ? umem->fd : -EINVAL; } int xsk_socket__fd(const struct xsk_socket *xsk) { return xsk ? 
xsk->fd : -EINVAL; } static bool xsk_page_aligned(void *buffer) { unsigned long addr = (unsigned long)buffer; return !(addr & (getpagesize() - 1)); } static void xsk_set_umem_config(struct xsk_umem_config *cfg, const struct xsk_umem_opts *opts) { cfg->fill_size = OPTS_GET(opts, fill_size, 0) ?: XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg->comp_size = OPTS_GET(opts, comp_size, 0) ?: XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg->frame_size = OPTS_GET(opts, frame_size, 0) ?: XSK_UMEM__DEFAULT_FRAME_SIZE; cfg->frame_headroom = OPTS_GET(opts, frame_headroom, 0) ?: XSK_UMEM__DEFAULT_FRAME_HEADROOM; cfg->flags = OPTS_GET(opts, flags, 0) ?: XSK_UMEM__DEFAULT_FLAGS; } static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, const struct xsk_socket_opts *opts) { __u32 libxdp_flags; libxdp_flags = OPTS_GET(opts, libxdp_flags, 0); if (libxdp_flags & ~XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD) return -EINVAL; cfg->rx_size = OPTS_GET(opts, rx_size, 0) ?: XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg->tx_size = OPTS_GET(opts, tx_size, 0) ?: XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg->libxdp_flags = libxdp_flags; cfg->xdp_flags = OPTS_GET(opts, xdp_flags, 0); cfg->bind_flags = OPTS_GET(opts, bind_flags, 0); return 0; } static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off) { struct xdp_mmap_offsets_v1 off_v1; /* getsockopt on a kernel <= 5.3 has no flags fields. * Copy over the offsets to the correct places in the >=5.4 format * and put the flags where they would have been on that kernel. 
*/ memcpy(&off_v1, off, sizeof(off_v1)); off->rx.producer = off_v1.rx.producer; off->rx.consumer = off_v1.rx.consumer; off->rx.desc = off_v1.rx.desc; off->rx.flags = off_v1.rx.consumer + sizeof(__u32); off->tx.producer = off_v1.tx.producer; off->tx.consumer = off_v1.tx.consumer; off->tx.desc = off_v1.tx.desc; off->tx.flags = off_v1.tx.consumer + sizeof(__u32); off->fr.producer = off_v1.fr.producer; off->fr.consumer = off_v1.fr.consumer; off->fr.desc = off_v1.fr.desc; off->fr.flags = off_v1.fr.consumer + sizeof(__u32); off->cr.producer = off_v1.cr.producer; off->cr.consumer = off_v1.cr.consumer; off->cr.desc = off_v1.cr.desc; off->cr.flags = off_v1.cr.consumer + sizeof(__u32); } static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off) { socklen_t optlen; int err; optlen = sizeof(*off); err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen); if (err) return err; if (optlen == sizeof(*off)) return 0; if (optlen == sizeof(struct xdp_mmap_offsets_v1)) { xsk_mmap_offsets_v1(off); return 0; } return -EINVAL; } static int xsk_create_umem_rings(struct xsk_umem *umem, int fd, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp) { struct xdp_mmap_offsets off; void *map_f, *map_c; int err; err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &umem->config.fill_size, sizeof(umem->config.fill_size)); if (err) return -errno; err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &umem->config.comp_size, sizeof(umem->config.comp_size)); if (err) return -errno; err = xsk_get_mmap_offsets(fd, &off); if (err) return -errno; map_f = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, XDP_UMEM_PGOFF_FILL_RING); if (map_f == MAP_FAILED) return -errno; fill->mask = umem->config.fill_size - 1; fill->size = umem->config.fill_size; fill->producer = map_f + off.fr.producer; fill->consumer = map_f + off.fr.consumer; fill->flags = map_f + off.fr.flags; fill->ring = map_f + off.fr.desc; fill->cached_cons = 
umem->config.fill_size; map_c = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, XDP_UMEM_PGOFF_COMPLETION_RING); if (map_c == MAP_FAILED) { err = -errno; goto out_mmap; } comp->mask = umem->config.comp_size - 1; comp->size = umem->config.comp_size; comp->producer = map_c + off.cr.producer; comp->consumer = map_c + off.cr.consumer; comp->flags = map_c + off.cr.flags; comp->ring = map_c + off.cr.desc; return 0; out_mmap: munmap(map_f, off.fr.desc + umem->config.fill_size * sizeof(__u64)); return err; } struct xsk_umem *xsk_umem__create_opts(void *umem_area, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, struct xsk_umem_opts *opts) { struct xdp_umem_reg mr; struct xsk_umem *umem; int err, fd; __u64 size; if (!umem_area || !fill || !comp) { err = -EFAULT; goto err; } if (!OPTS_VALID(opts, xsk_umem_opts)) { err = -EINVAL; goto err; } fd = OPTS_GET(opts, fd, 0); size = OPTS_GET(opts, size, 0); if (!size && !xsk_page_aligned(umem_area)) { err = -EINVAL; goto err; } umem = calloc(1, sizeof(*umem)); if (!umem) { err = -ENOMEM; goto err; } umem->fd = fd > 0 ? 
fd : socket(AF_XDP, SOCK_RAW, 0); if (umem->fd < 0) { err = -errno; goto out_umem_alloc; } umem->umem_area = umem_area; INIT_LIST_HEAD(&umem->ctx_list); xsk_set_umem_config(&umem->config, opts); memset(&mr, 0, sizeof(mr)); mr.addr = (uintptr_t)umem_area; mr.len = size; mr.chunk_size = umem->config.frame_size; mr.headroom = umem->config.frame_headroom; mr.flags = umem->config.flags; mr.tx_metadata_len = OPTS_GET(opts, tx_metadata_len, XSK_UMEM__DEFAULT_TX_METADATA_LEN); err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); if (err) { err = -errno; goto out_socket; } err = xsk_create_umem_rings(umem, umem->fd, fill, comp); if (err) goto out_socket; umem->fill_save = fill; umem->comp_save = comp; return umem; out_socket: close(umem->fd); out_umem_alloc: free(umem); err: return libxdp_err_ptr(err, true); } int xsk_umem__create_with_fd(struct xsk_umem **umem_ptr, int fd, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *usr_config) { struct xsk_umem *umem; if (!umem_ptr) return -EFAULT; DECLARE_LIBXDP_OPTS(xsk_umem_opts, opts, .fd = fd, .size = size, ); if (usr_config) { opts.fill_size = usr_config->fill_size; opts.comp_size = usr_config->comp_size; opts.frame_size = usr_config->frame_size; opts.frame_headroom = usr_config->frame_headroom; opts.flags = usr_config->flags; } umem = xsk_umem__create_opts(umem_area, fill, comp, &opts); if (!umem) return -errno; *umem_ptr = umem; return 0; } int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_umem_config *usr_config) { return xsk_umem__create_with_fd(umem_ptr, 0, umem_area, size, fill, comp, usr_config); } static int xsk_init_xsk_struct(struct xsk_socket *xsk, int ifindex) { char ifname[IFNAMSIZ]; struct xsk_ctx *ctx; char *interface; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return -ENOMEM; interface = if_indextoname(ifindex, &ifname[0]); if 
(!interface) { free(ctx); return -errno; } ctx->ifindex = ifindex; memcpy(ctx->ifname, ifname, IFNAMSIZ -1); ctx->ifname[IFNAMSIZ - 1] = 0; xsk->ctx = ctx; return 0; } static enum xdp_attach_mode xsk_convert_xdp_flags(__u32 xdp_flags) { if (xdp_flags & ~XDP_FLAGS_MASK) pr_warn("XDP flag: 0x%x contains flags not supported by libxdp.\n", xdp_flags); if (xdp_flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; if (xdp_flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_NATIVE; if (xdp_flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; return XDP_MODE_NATIVE; } #define MAX_DEV_QUEUE_PATH_LEN 64 static void xsk_get_queues_from_sysfs(const char* ifname, __u32 *rx, __u32 *tx) { char buf[MAX_DEV_QUEUE_PATH_LEN]; struct dirent *entry; DIR *dir; int err; *rx = *tx = 0; err = try_snprintf(buf, MAX_DEV_QUEUE_PATH_LEN, "/sys/class/net/%s/queues/", ifname); if (err) return; dir = opendir(buf); if(dir == NULL) return; while((entry = readdir(dir))) { if (0 == strncmp(entry->d_name, "rx", 2)) ++*rx; if (0 == strncmp(entry->d_name, "tx", 2)) ++*tx; } closedir(dir); } static int xsk_get_max_queues(char *ifname) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; struct ifreq ifr = {}; int fd, err, ret; fd = socket(AF_LOCAL, SOCK_DGRAM, 0); if (fd < 0) return -errno; ifr.ifr_data = (void *)&channels; memcpy(ifr.ifr_name, ifname, IFNAMSIZ - 1); ifr.ifr_name[IFNAMSIZ - 1] = '\0'; err = ioctl(fd, SIOCETHTOOL, &ifr); if (err && errno != EOPNOTSUPP) { ret = -errno; goto out; } if (err) { /* If the device says it has no channels, * try to get rx tx from sysfs, otherwise all traffic * is sent to a single stream, so max queues = 1. */ __u32 rx, tx; xsk_get_queues_from_sysfs(ifr.ifr_name, &rx, &tx); ret = max(max(rx, tx), 1); } else { /* Take the max of rx, tx, combined. Drivers return * the number of channels in different ways. 
*/ ret = max(channels.max_rx, channels.max_tx); ret = max(ret, (int)channels.max_combined); } out: close(fd); return ret; } static int xsk_size_map(struct xdp_program *xdp_prog, char *ifname) { struct bpf_object *bpf_obj = xdp_program__bpf_obj(xdp_prog); struct bpf_map *map; int max_queues; int err; max_queues = xsk_get_max_queues(ifname); if (max_queues < 0) return max_queues; map = bpf_object__find_map_by_name(bpf_obj, "xsks_map"); if (!map) return -ENOENT; err = bpf_map__set_max_entries(map, max_queues); if (err) return err; return 0; } static void xsk_delete_map_entry(int xsks_map_fd, __u32 queue_id) { bpf_map_delete_elem(xsks_map_fd, &queue_id); close(xsks_map_fd); } static int xsk_lookup_map_by_filter(int prog_fd, bool (*map_info_filter)(struct bpf_map_info *map_info)) { __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); __u32 map_len = sizeof(struct bpf_map_info); struct bpf_prog_info prog_info = {}; int fd, err, xsks_map_fd = -ENOENT; struct bpf_map_info map_info; err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); if (err) return err; num_maps = prog_info.nr_map_ids; map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); if (!map_ids) return -ENOMEM; memset(&prog_info, 0, prog_len); prog_info.nr_map_ids = num_maps; prog_info.map_ids = (__u64)(unsigned long)map_ids; err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); if (err) { free(map_ids); return err; } for (i = 0; i < prog_info.nr_map_ids; i++) { fd = bpf_map_get_fd_by_id(map_ids[i]); if (fd < 0) continue; memset(&map_info, 0, map_len); err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); if (err) { close(fd); continue; } if (map_info_filter(&map_info)) { xsks_map_fd = fd; break; } close(fd); } free(map_ids); return xsks_map_fd; } static bool xsk_map_is_socket_map(struct bpf_map_info *map_info) { return !strncmp(map_info->name, "xsks_map", sizeof(map_info->name)) && map_info->key_size == 4 && map_info->value_size == 4; } static bool 
xsk_map_is_refcnt_map(struct bpf_map_info *map_info) { /* In order to avoid confusing users with multiple identically named * maps, libbpf names non-custom internal maps (.data, .bss, etc.) * in an unexpected way, namely the first 8 characters of a bpf object * name + a suffix signifying the internal map type, * ex. "xdp_def_" + ".data". */ return !strncmp(map_info->name, "xsk_def_.data", sizeof(map_info->name)) && map_info->value_size >= sizeof(int); } static int xsk_lookup_bpf_map(int prog_fd) { return xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_socket_map); } static int xsk_lookup_refcnt_map(int prog_fd, const char *xdp_filename) { int map_fd = xsk_lookup_map_by_filter(prog_fd, &xsk_map_is_refcnt_map); if (map_fd >= 0) goto out; if (map_fd != -ENOENT) { pr_debug("Error getting refcount map: %s\n", strerror(-map_fd)); goto out; } if (xdp_filename) pr_warn("Refcount was not found in %s or kernel does not support required features, so automatic program removal on unload is disabled\n", xdp_filename); else pr_warn("Another XSK socket was created by a version of libxdp that doesn't support program refcnt, so automatic program removal on unload is disabled.\n"); out: return map_fd; } #ifdef HAVE_LIBBPF_BPF_MAP_CREATE /* bpf_map_create() and the new bpf_prog_create() were added at the same time - * however there's a naming conflict with another bpf_prog_load() function in * older versions of libbpf; to avoid hitting that we create our own wrapper * function for this one even with new libbpf versions. 
*/ static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt) { return bpf_prog_load(BPF_PROG_TYPE_XDP, "testprog", "GPL", insns, insns_cnt, NULL); } #else static int bpf_map_create(enum bpf_map_type map_type, __unused const char *map_name, __u32 key_size, __u32 value_size, __u32 max_entries, __unused void *opts) { struct bpf_create_map_attr map_attr; memset(&map_attr, 0, sizeof(map_attr)); map_attr.map_type = map_type; map_attr.key_size = key_size; map_attr.value_size = value_size; map_attr.max_entries = max_entries; return bpf_create_map_xattr(&map_attr); } static int xsk_check_create_prog(struct bpf_insn *insns, size_t insns_cnt) { struct bpf_load_program_attr prog_attr; memset(&prog_attr, 0, sizeof(prog_attr)); prog_attr.prog_type = BPF_PROG_TYPE_XDP; prog_attr.insns = insns; prog_attr.insns_cnt = insns_cnt; prog_attr.license = "GPL"; return bpf_load_program_xattr(&prog_attr, NULL, 0); } #endif static bool xsk_check_redirect_flags(void) { char data_in = 0, data_out; DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data_in, .data_out = &data_out, .data_size_in = 1); struct bpf_insn insns[] = { BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, XDP_PASS), BPF_EMIT_CALL(BPF_FUNC_redirect_map), BPF_EXIT_INSN(), }; int prog_fd, map_fd, ret; bool detected = false; map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xskmap", sizeof(int), sizeof(int), 1, NULL); if (map_fd < 0) return detected; insns[0].imm = map_fd; prog_fd = xsk_check_create_prog(insns, ARRAY_SIZE(insns)); if (prog_fd < 0) { close(map_fd); return detected; } ret = bpf_prog_test_run_opts(prog_fd, &opts); if (!ret && opts.retval == XDP_PASS) detected = true; close(prog_fd); close(map_fd); return detected; } static struct xdp_program *xsk_lookup_program(int ifindex) { const char *version_name = "xsk_prog_version"; const char *prog_name = "xsk_def_prog"; struct xdp_multiprog *multi_prog; struct xdp_program *prog = NULL; __u32 version; int err; multi_prog = 
xdp_multiprog__get_from_ifindex(ifindex); if (IS_ERR(multi_prog)) return NULL; if (xdp_multiprog__is_legacy(multi_prog)) { prog = xdp_multiprog__main_prog(multi_prog); prog = strcmp(xdp_program__name(prog), prog_name) ? NULL : prog; goto check; } while ((prog = xdp_multiprog__next_prog(prog, multi_prog))) if (!strcmp(xdp_program__name(prog), prog_name)) break; check: if (!prog) goto out; err = check_xdp_prog_version(xdp_program__btf(prog), version_name, &version); if (err) { prog = ERR_PTR(err); goto out; } if (version > XSK_PROG_VERSION) { pr_warn("XSK default program version %d higher than supported %d\n", version, XSK_PROG_VERSION); prog = ERR_PTR(-EOPNOTSUPP); } out: if (!IS_ERR_OR_NULL(prog)) prog = xdp_program__clone(prog, 0); xdp_multiprog__close(multi_prog); return prog; } static int xsk_update_prog_refcnt(int refcnt_map_fd, int delta) { struct bpf_map_info map_info = {}; __u32 info_len = sizeof(map_info); int *value_data = NULL; int lock_fd, ret; __u32 key = 0; ret = bpf_obj_get_info_by_fd(refcnt_map_fd, &map_info, &info_len); if (ret) return ret; value_data = calloc(1, map_info.value_size); if (!value_data) return -ENOMEM; lock_fd = xdp_lock_acquire(); if (lock_fd < 0) { ret = lock_fd; goto out; } /* Note, if other global variables are added before the refcnt, * this changes map's value type, not number of elements, * so additional offset must be applied to value_data, * when reading refcount, but map key always stays zero */ ret = bpf_map_lookup_elem(refcnt_map_fd, &key, value_data); if (ret) goto unlock; /* If refcount is 0, program is awaiting detach and can't be used */ if (*value_data) { *value_data += delta; ret = bpf_map_update_elem(refcnt_map_fd, &key, value_data, 0); if (ret) goto unlock; } ret = *value_data; unlock: xdp_lock_release(lock_fd); out: free(value_data); return ret; } static int xsk_incr_prog_refcnt(int refcnt_map_fd) { return xsk_update_prog_refcnt(refcnt_map_fd, 1); } static int xsk_decr_prog_refcnt(int refcnt_map_fd) { return 
xsk_update_prog_refcnt(refcnt_map_fd, -1); } static int __xsk_setup_xdp_prog(struct xsk_socket *xsk, int *xsks_map_fd) { const char *fallback_prog = "xsk_def_xdp_prog_5.3.o"; const char *default_prog = "xsk_def_xdp_prog.o"; struct xsk_ctx *ctx = xsk->ctx; const char *file_name = NULL; bool attached = false; int err; ctx->xdp_prog = xsk_lookup_program(ctx->ifindex); if (IS_ERR(ctx->xdp_prog)) return PTR_ERR(ctx->xdp_prog); ctx->refcnt_map_fd = -ENOENT; if (ctx->xdp_prog) { int refcnt; ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog), NULL); if (ctx->refcnt_map_fd == -ENOENT) goto map_lookup; if (ctx->refcnt_map_fd < 0) { err = ctx->refcnt_map_fd; goto err_prog_load; } refcnt = xsk_incr_prog_refcnt(ctx->refcnt_map_fd); if (refcnt < 0) { err = refcnt; pr_debug("Error occurred when incrementing xsk XDP prog refcount: %s\n", strerror(-err)); goto err_prog_load; } if (!refcnt) { pr_warn("Current program is being detached, falling back on creating a new program\n"); close(ctx->refcnt_map_fd); ctx->refcnt_map_fd = -ENOENT; xdp_program__close(ctx->xdp_prog); ctx->xdp_prog = NULL; } } if (!ctx->xdp_prog) { file_name = xsk_check_redirect_flags() ? 
default_prog : fallback_prog; ctx->xdp_prog = xdp_program__find_file(file_name, NULL, NULL); if (IS_ERR(ctx->xdp_prog)) return PTR_ERR(ctx->xdp_prog); err = xsk_size_map(ctx->xdp_prog, ctx->ifname); if (err) goto err_prog_load; err = xdp_program__attach(ctx->xdp_prog, ctx->ifindex, xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); if (err) goto err_prog_load; attached = true; } if (ctx->refcnt_map_fd < 0) { ctx->refcnt_map_fd = xsk_lookup_refcnt_map(xdp_program__fd(ctx->xdp_prog), file_name); if (ctx->refcnt_map_fd < 0 && ctx->refcnt_map_fd != -ENOENT) { err = ctx->refcnt_map_fd; goto err_prog_load; } } map_lookup: ctx->xsks_map_fd = xsk_lookup_bpf_map(xdp_program__fd(ctx->xdp_prog)); if (ctx->xsks_map_fd < 0) { err = ctx->xsks_map_fd; goto err_lookup; } if (xsk->rx) { err = bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, &xsk->fd, 0); if (err) goto err_lookup; } if (xsks_map_fd) *xsks_map_fd = ctx->xsks_map_fd; return 0; err_lookup: if (attached) xdp_program__detach(ctx->xdp_prog, ctx->ifindex, xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); err_prog_load: if (ctx->refcnt_map_fd >= 0) close(ctx->refcnt_map_fd); ctx->refcnt_map_fd = -ENOENT; xdp_program__close(ctx->xdp_prog); ctx->xdp_prog = NULL; return err; } static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, __u64 netns_cookie, int ifindex, __u32 queue_id) { struct xsk_ctx *ctx; if (list_empty(&umem->ctx_list)) return NULL; list_for_each_entry(ctx, &umem->ctx_list, list) { if (ctx->netns_cookie == netns_cookie && ctx->ifindex == ifindex && ctx->queue_id == queue_id) { ctx->refcount++; return ctx; } } return NULL; } static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) { struct xsk_umem *umem = ctx->umem; struct xdp_mmap_offsets off; int err; if (--ctx->refcount) return; if (!unmap) goto out_free; err = xsk_get_mmap_offsets(umem->fd, &off); if (err) goto out_free; munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * sizeof(__u64)); munmap(ctx->comp->ring - off.cr.desc, 
off.cr.desc + umem->config.comp_size * sizeof(__u64)); out_free: list_del(&ctx->list); free(ctx); } static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, struct xsk_umem *umem, __u64 netns_cookie, int ifindex, const char *ifname, __u32 queue_id, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp) { struct xsk_ctx *ctx; int err; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; if (!umem->fill_save) { err = xsk_create_umem_rings(umem, xsk->fd, fill, comp); if (err) { free(ctx); return NULL; } } else if (umem->fill_save != fill || umem->comp_save != comp) { /* Copy over rings to new structs. */ memcpy(fill, umem->fill_save, sizeof(*fill)); memcpy(comp, umem->comp_save, sizeof(*comp)); } ctx->netns_cookie = netns_cookie; ctx->ifindex = ifindex; ctx->refcount = 1; ctx->umem = umem; ctx->queue_id = queue_id; memcpy(ctx->ifname, ifname, IFNAMSIZ - 1); ctx->ifname[IFNAMSIZ - 1] = '\0'; ctx->fill = fill; ctx->comp = comp; list_add(&ctx->list, &umem->ctx_list); return ctx; } static void xsk_destroy_xsk_struct(struct xsk_socket *xsk) { xdp_program__close(xsk->ctx->xdp_prog); free(xsk->ctx); free(xsk); } int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd) { struct xsk_ctx *ctx = xsk->ctx; ctx->xsks_map_fd = fd; return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, &xsk->fd, 0); } int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd) { struct xsk_socket *xsk; int res; xsk = calloc(1, sizeof(*xsk)); if (!xsk) return -ENOMEM; res = xsk_init_xsk_struct(xsk, ifindex); if (res) { free(xsk); return -EINVAL; } res = __xsk_setup_xdp_prog(xsk, xsks_map_fd); xsk_destroy_xsk_struct(xsk); return res; } struct xsk_socket *xsk_socket__create_opts(const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_socket_opts *opts) { bool rx_setup_done = false, tx_setup_done = false; void *rx_map = NULL, *tx_map = NULL; struct sockaddr_xdp sxdp = {}; struct xdp_mmap_offsets off; struct xsk_ring_prod *fill; struct xsk_ring_cons *comp; struct xsk_ring_cons 
*rx; struct xsk_ring_prod *tx; struct xsk_socket *xsk; struct xsk_ctx *ctx; int err, ifindex; __u64 netns_cookie; socklen_t optlen; bool unmap; if (!OPTS_VALID(opts, xsk_socket_opts)) { err = -EINVAL; goto err; } rx = OPTS_GET(opts, rx, NULL); tx = OPTS_GET(opts, tx, NULL); fill = OPTS_GET(opts, fill, NULL); comp = OPTS_GET(opts, comp, NULL); if (!umem || !(rx || tx) || (fill == NULL) ^ (comp == NULL)) { err = -EFAULT; goto err; } if (!fill && !comp) { fill = umem->fill_save; comp = umem->comp_save; } xsk = calloc(1, sizeof(*xsk)); if (!xsk) { err = -ENOMEM; goto err; } err = xsk_set_xdp_socket_config(&xsk->config, opts); if (err) goto out_xsk_alloc; ifindex = if_nametoindex(ifname); if (!ifindex) { err = -errno; goto out_xsk_alloc; } if (umem->refcount++ > 0) { xsk->fd = socket(AF_XDP, SOCK_RAW, 0); if (xsk->fd < 0) { err = -errno; goto out_xsk_alloc; } } else { xsk->fd = umem->fd; rx_setup_done = umem->rx_ring_setup_done; tx_setup_done = umem->tx_ring_setup_done; } optlen = sizeof(netns_cookie); err = getsockopt(xsk->fd, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); if (err) { if (errno != ENOPROTOOPT) { err = -errno; goto out_socket; } netns_cookie = INIT_NS; } ctx = xsk_get_ctx(umem, netns_cookie, ifindex, queue_id); if (!ctx) { if (!fill || !comp) { err = -EFAULT; goto out_socket; } ctx = xsk_create_ctx(xsk, umem, netns_cookie, ifindex, ifname, queue_id, fill, comp); if (!ctx) { err = -ENOMEM; goto out_socket; } } xsk->ctx = ctx; if (rx && !rx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, &xsk->config.rx_size, sizeof(xsk->config.rx_size)); if (err) { err = -errno; goto out_put_ctx; } if (xsk->fd == umem->fd) umem->rx_ring_setup_done = true; } if (tx && !tx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, &xsk->config.tx_size, sizeof(xsk->config.tx_size)); if (err) { err = -errno; goto out_put_ctx; } if (xsk->fd == umem->fd) umem->tx_ring_setup_done = true; } err = xsk_get_mmap_offsets(xsk->fd, &off); if (err) { err = 
-errno; goto out_put_ctx; } if (rx) { rx_map = mmap(NULL, off.rx.desc + xsk->config.rx_size * sizeof(struct xdp_desc), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, xsk->fd, XDP_PGOFF_RX_RING); if (rx_map == MAP_FAILED) { err = -errno; goto out_put_ctx; } rx->mask = xsk->config.rx_size - 1; rx->size = xsk->config.rx_size; rx->producer = rx_map + off.rx.producer; rx->consumer = rx_map + off.rx.consumer; rx->flags = rx_map + off.rx.flags; rx->ring = rx_map + off.rx.desc; rx->cached_prod = *rx->producer; rx->cached_cons = *rx->consumer; } xsk->rx = rx; if (tx) { tx_map = mmap(NULL, off.tx.desc + xsk->config.tx_size * sizeof(struct xdp_desc), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, xsk->fd, XDP_PGOFF_TX_RING); if (tx_map == MAP_FAILED) { err = -errno; goto out_mmap_rx; } tx->mask = xsk->config.tx_size - 1; tx->size = xsk->config.tx_size; tx->producer = tx_map + off.tx.producer; tx->consumer = tx_map + off.tx.consumer; tx->flags = tx_map + off.tx.flags; tx->ring = tx_map + off.tx.desc; tx->cached_prod = *tx->producer; /* cached_cons is r->size bigger than the real consumer pointer * See xsk_prod_nb_free */ tx->cached_cons = *tx->consumer + xsk->config.tx_size; } xsk->tx = tx; sxdp.sxdp_family = PF_XDP; sxdp.sxdp_ifindex = ctx->ifindex; sxdp.sxdp_queue_id = ctx->queue_id; if (umem->refcount > 1) { sxdp.sxdp_flags |= XDP_SHARED_UMEM; sxdp.sxdp_shared_umem_fd = umem->fd; } else { sxdp.sxdp_flags = xsk->config.bind_flags; } err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); if (err) { err = -errno; goto out_mmap_tx; } if (!(xsk->config.libxdp_flags & XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD)) { err = __xsk_setup_xdp_prog(xsk, NULL); if (err) goto out_mmap_tx; } umem->fill_save = NULL; umem->comp_save = NULL; return xsk; out_mmap_tx: if (tx) munmap(tx_map, off.tx.desc + xsk->config.tx_size * sizeof(struct xdp_desc)); out_mmap_rx: if (rx) munmap(rx_map, off.rx.desc + xsk->config.rx_size * sizeof(struct xdp_desc)); out_put_ctx: unmap = umem->fill_save != 
fill; xsk_put_ctx(ctx, unmap); out_socket: if (--umem->refcount) close(xsk->fd); out_xsk_alloc: free(xsk); err: return libxdp_err_ptr(err, true); } int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, const struct xsk_socket_config *usr_config) { struct xsk_socket *xsk; if (!xsk_ptr) return -EFAULT; DECLARE_LIBXDP_OPTS(xsk_socket_opts, opts, .rx = rx, .tx = tx, .fill = fill, .comp = comp, ); if (usr_config) { opts.rx_size = usr_config->rx_size; opts.tx_size= usr_config->tx_size; opts.libxdp_flags = usr_config->libxdp_flags; opts.xdp_flags = usr_config->xdp_flags; opts.bind_flags = usr_config->bind_flags; } xsk = xsk_socket__create_opts(ifname, queue_id, umem, &opts); if (!xsk) return -errno; *xsk_ptr = xsk; return 0; } int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, __u32 queue_id, struct xsk_umem *umem, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *usr_config) { if (!umem) return -EFAULT; return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, rx, tx, umem->fill_save, umem->comp_save, usr_config); } int xsk_umem__delete(struct xsk_umem *umem) { struct xdp_mmap_offsets off; int err; if (!umem) return 0; if (umem->refcount) return -EBUSY; err = xsk_get_mmap_offsets(umem->fd, &off); if (!err && umem->fill_save && umem->comp_save) { munmap(umem->fill_save->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * sizeof(__u64)); munmap(umem->comp_save->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * sizeof(__u64)); } close(umem->fd); free(umem); return 0; } static void xsk_release_xdp_prog(struct xsk_socket *xsk) { struct xsk_ctx *ctx = xsk->ctx; int value; if (xsk->ctx->refcnt_map_fd < 0) goto out; value = xsk_decr_prog_refcnt(ctx->refcnt_map_fd); if (value < 0) pr_warn("Error occurred when decrementing xsk XDP 
prog refcount: %s, please detach program yourself\n", strerror(-value)); if (value) goto out; xdp_program__detach(ctx->xdp_prog, ctx->ifindex, xsk_convert_xdp_flags(xsk->config.xdp_flags), 0); out: xdp_program__close(ctx->xdp_prog); } void xsk_socket__delete(struct xsk_socket *xsk) { size_t desc_sz = sizeof(struct xdp_desc); struct xdp_mmap_offsets off; struct xsk_umem *umem; struct xsk_ctx *ctx; int err; if (!xsk) return; ctx = xsk->ctx; umem = ctx->umem; if (ctx->xdp_prog) { xsk_delete_map_entry(ctx->xsks_map_fd, ctx->queue_id); xsk_release_xdp_prog(xsk); } err = xsk_get_mmap_offsets(xsk->fd, &off); if (!err) { if (xsk->rx) { munmap(xsk->rx->ring - off.rx.desc, off.rx.desc + xsk->config.rx_size * desc_sz); } if (xsk->tx) { munmap(xsk->tx->ring - off.tx.desc, off.tx.desc + xsk->config.tx_size * desc_sz); } } xsk_put_ctx(ctx, true); umem->refcount--; /* Do not close an fd that also has an associated umem connected * to it. */ if (xsk->fd != umem->fd) close(xsk->fd); free(xsk); } xdp-tools-1.6.1/lib/libxdp/xsk_def_xdp_prog.c000066400000000000000000000020761514310632100211720ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include "xsk_def_xdp_prog.h" #define DEFAULT_QUEUE_IDS 64 struct { __uint(type, BPF_MAP_TYPE_XSKMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, DEFAULT_QUEUE_IDS); } xsks_map SEC(".maps"); struct { __uint(priority, 20); __uint(XDP_PASS, 1); } XDP_RUN_CONFIG(xsk_def_prog); /* Program refcount, in order to work properly, * must be declared before any other global variables * and initialized with '1'. */ volatile int refcnt = 1; /* This is the program for post 5.3 kernels. */ SEC("xdp") int xsk_def_prog(struct xdp_md *ctx) { /* Make sure refcount is referenced by the program */ if (!refcnt) return XDP_PASS; /* A set entry here means that the corresponding queue_id * has an active AF_XDP socket bound to it. 
*/ return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS); } char _license[] SEC("license") = "GPL"; __uint(xsk_prog_version, XSK_PROG_VERSION) SEC(XDP_METADATA_SECTION); xdp-tools-1.6.1/lib/libxdp/xsk_def_xdp_prog.h000066400000000000000000000003611514310632100211720ustar00rootroot00000000000000// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) #ifndef __LIBXDP_XSK_DEF_XDP_PROG_H #define __LIBXDP_XSK_DEF_XDP_PROG_H #define XDP_METADATA_SECTION "xdp_metadata" #define XSK_PROG_VERSION 1 #endif /* __LIBXDP_XSK_DEF_XDP_PROG_H */ xdp-tools-1.6.1/lib/libxdp/xsk_def_xdp_prog_5.3.c000066400000000000000000000022221514310632100215500ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include "xsk_def_xdp_prog.h" #define DEFAULT_QUEUE_IDS 64 struct { __uint(type, BPF_MAP_TYPE_XSKMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, DEFAULT_QUEUE_IDS); } xsks_map SEC(".maps"); struct { __uint(priority, 20); __uint(XDP_PASS, 1); } XDP_RUN_CONFIG(xsk_def_prog); /* Program refcount, in order to work properly, * must be declared before any other global variables * and initialized with '1'. */ volatile int refcnt = 1; /* This is the program for 5.3 kernels and older. */ SEC("xdp") int xsk_def_prog(struct xdp_md *ctx) { int index = ctx->rx_queue_index; /* Make sure refcount is referenced by the program */ if (!refcnt) return XDP_PASS; /* A set entry here means that the corresponding queue_id * has an active AF_XDP socket bound to it. 
*/ if (bpf_map_lookup_elem(&xsks_map, &index)) return bpf_redirect_map(&xsks_map, index, 0); return XDP_PASS; } char _license[] SEC("license") = "GPL"; __uint(xsk_prog_version, XSK_PROG_VERSION) SEC(XDP_METADATA_SECTION); xdp-tools-1.6.1/lib/testing/000077500000000000000000000000001514310632100156675ustar00rootroot00000000000000xdp-tools-1.6.1/lib/testing/.gitignore000066400000000000000000000000121514310632100176500ustar00rootroot00000000000000test-tool xdp-tools-1.6.1/lib/testing/Makefile000066400000000000000000000007251514310632100173330ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) TEST_TARGETS := test-tool XDP_TARGETS := test_long_func_name xdp_drop xdp_pass xdp_adjust_tail SCRIPTS_FILES := test_runner.sh setup-netns-env.sh run_tests.sh XDP_OBJ_INSTALL := LIB_DIR = .. include $(LIB_DIR)/common.mk install_local:: install -m 0755 -d $(DESTDIR)$(SCRIPTSDIR) install -m 0644 test_config.install.sh $(DESTDIR)$(SCRIPTSDIR)/test_config.sh install -m 0644 $(XDP_OBJ) $(DESTDIR)$(SCRIPTSDIR)/ xdp-tools-1.6.1/lib/testing/run_tests.sh000077500000000000000000000006041514310632100202540ustar00rootroot00000000000000#!/bin/bash TEST_PROG_DIR="${TEST_PROG_DIR:-$(dirname "${BASH_SOURCE[0]}")}" TESTS_DIR="${TESTS_DIR:-$TEST_PROG_DIR/tests}" TEST_RUNNER="$TEST_PROG_DIR/test_runner.sh" RET=0 echo "Running all tests from $TESTS_DIR" for f in "$TESTS_DIR"/*/test-*.sh; do if [[ ! -f "$f" ]]; then echo "No tests found!" exit 1 fi "$TEST_RUNNER" "$f" || RET=1 done exit $RET xdp-tools-1.6.1/lib/testing/setup-netns-env.sh000077500000000000000000000010451514310632100213010ustar00rootroot00000000000000#!/bin/bash # SPDX-License-Identifier: GPL-2.0-or-later # # Script to setup things inside a test environment, used by testenv.sh for # executing commands. 
# # Author: Toke Høiland-Jørgensen (toke@redhat.com) # Date: 7 March 2019 # Copyright (c) 2019 Red Hat die() { echo "$1" >&2 exit 1 } [ -n "$TESTENV_NAME" ] || die "TESTENV_NAME missing from environment" [ -n "$1" ] || die "Usage: $0 " set -o nounset mount -t bpf bpf /sys/fs/bpf/ || die "Unable to mount /sys/fs/bpf inside test environment" exec "$@" xdp-tools-1.6.1/lib/testing/test-tool.c000066400000000000000000000146051514310632100177730ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "params.h" #include "logging.h" #include "util.h" #include "xdp_sample.h" #include "xdpsock.h" #include "compat.h" #define PROG_NAME "test-tool" struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {"unspecified", XDP_MODE_UNSPEC}, {NULL, 0} }; static const struct loadopt { bool help; enum xdp_attach_mode mode; struct iface iface; char *filename; } defaults_load = { .mode = XDP_MODE_NATIVE }; static struct bpf_object *open_bpf_obj(const char *filename, struct bpf_object_open_opts *opts) { struct bpf_object *obj; int err; obj = bpf_object__open_file(filename, opts); err = libbpf_get_error(obj); if (err) { if (err == -ENOENT) pr_debug( "Couldn't load the eBPF program (libbpf said 'no such file').\n" "Maybe the program was compiled with a too old " "version of LLVM (need v9.0+)?\n"); return ERR_PTR(err); } return obj; } static int do_xdp_attach(int ifindex, int prog_fd, int old_fd, __u32 xdp_flags) { #ifdef HAVE_LIBBPF_BPF_XDP_ATTACH LIBBPF_OPTS(bpf_xdp_attach_opts, opts, .old_prog_fd = old_fd); return bpf_xdp_attach(ifindex, prog_fd, xdp_flags, &opts); #else DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = old_fd); return bpf_set_link_xdp_fd_opts(ifindex, prog_fd, xdp_flags, old_fd ? 
&opts : NULL); #endif } int do_load(const void *cfg, __unused const char *pin_root_path) { const struct loadopt *opt = cfg; struct bpf_program *bpf_prog; char errmsg[STRERR_BUFSIZE]; struct bpf_object *obj; int err = EXIT_SUCCESS; int xdp_flags; int prog_fd; silence_libbpf_logging(); retry: obj = open_bpf_obj(opt->filename, NULL); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err == -EPERM && !double_rlimit()) goto retry; libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("ERROR: Couldn't open file '%s': %s\n", opt->filename, errmsg); goto out; } err = bpf_object__load(obj); if (err) { if (err == -EPERM && !double_rlimit()) { bpf_object__close(obj); goto retry; } libbpf_strerror(err, errmsg, sizeof(errmsg)); pr_warn("ERROR: Can't load eBPF object: %s(%d)\n", errmsg, err); goto out; } bpf_prog = bpf_object__next_program(obj, NULL); if (!bpf_prog) { pr_warn("ERROR: Couldn't find xdp program in bpf object!\n"); err = -ENOENT; goto out; } prog_fd = bpf_program__fd(bpf_prog); if (prog_fd < 0) { err = prog_fd; libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("ERROR: Couldn't find xdp program's file descriptor: %s\n", errmsg); goto out; } xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; switch (opt->mode) { case XDP_MODE_SKB: xdp_flags |= XDP_FLAGS_SKB_MODE; break; case XDP_MODE_NATIVE: xdp_flags |= XDP_FLAGS_DRV_MODE; break; case XDP_MODE_HW: xdp_flags |= XDP_FLAGS_HW_MODE; break; case XDP_MODE_UNSPEC: break; } err = do_xdp_attach(opt->iface.ifindex, prog_fd, 0, xdp_flags); if (err < 0) { pr_info("ERROR: Failed attaching XDP program to ifindex %d: %s\n", opt->iface.ifindex, strerror(-err)); switch (-err) { case EBUSY: case EEXIST: pr_info("XDP already loaded on device.\n"); break; case EOPNOTSUPP: pr_info("XDP mode not supported; try using SKB mode.\n"); break; default: break; } goto out; } out: return err; } static struct prog_option load_options[] = { DEFINE_OPTION("mode", OPT_ENUM, struct loadopt, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load 
XDP program in ; default native"), DEFINE_OPTION("dev", OPT_IFNAME, struct loadopt, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), DEFINE_OPTION("filename", OPT_STRING, struct loadopt, filename, .positional = true, .metavar = "", .required = true, .help = "Load program from "), END_OPTIONS }; enum probe_action { PROBE_CPUMAP_PROGRAM, PROBE_XDP_LOAD_BYTES, PROBE_XSK_BUSY_POLL, }; struct enum_val probe_actions[] = { {"cpumap-prog", PROBE_CPUMAP_PROGRAM}, {"xdp-load-bytes", PROBE_XDP_LOAD_BYTES}, {"xsk-busy-poll", PROBE_XSK_BUSY_POLL}, {NULL, 0} }; static const struct probeopt { enum probe_action action; } defaults_probe = {}; int do_probe(const void *cfg, __unused const char *pin_root_path) { const struct probeopt *opt = cfg; bool res = false; switch (opt->action) { case PROBE_CPUMAP_PROGRAM: #ifdef HAVE_BPFTOOL res = sample_probe_cpumap_compat(); #endif break; case PROBE_XDP_LOAD_BYTES: #ifdef HAVE_BPFTOOL res = sample_probe_xdp_load_bytes(); #endif break; case PROBE_XSK_BUSY_POLL: #ifdef HAVE_BPFTOOL res = xsk_probe_busy_poll(); #endif break; default: return EXIT_FAILURE; } pr_debug("Probing for %s: %s\n", probe_actions[opt->action].name, res ? "Supported" : "Unsupported"); return res ? 
EXIT_SUCCESS : EXIT_FAILURE; } static struct prog_option probe_options[] = { DEFINE_OPTION("action", OPT_ENUM, struct probeopt, action, .positional = true, .metavar = "", .required = true, .typearg = probe_actions, .help = "Probe for "), END_OPTIONS }; int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: test-tool COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " load - load an XDP program on an interface\n" " probe - probe for kernel features\n" " help - show this help message\n" "\n" "Use 'test-tool COMMAND --help' to see options for each command\n"); return -1; } static const struct prog_command cmds[] = { DEFINE_COMMAND(load, "Load an XDP program on an interface"), DEFINE_COMMAND(probe, "Probe for kernel features"), { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct loadopt load; struct probeopt probe; }; int main(int argc, char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, false); return do_help(NULL, NULL); } xdp-tools-1.6.1/lib/testing/test_config.install.sh000066400000000000000000000004171514310632100221760ustar00rootroot00000000000000# Test config for having tools in $PATH - to be installed along with the # test runners in /usr/share/xdp-tools XDPDUMP=xdpdump XDP_BENCH=xdp-bench XDP_FILTER=xdp-filter XDP_FORWARD=xdp-forward XDP_LOADER=xdp-loader XDP_MONITOR=xdp-monitor XDP_TRAFFICGEN=xdp-trafficgen xdp-tools-1.6.1/lib/testing/test_long_func_name.c000066400000000000000000000012231514310632100220420ustar00rootroot00000000000000#include #include #include #define bpf_debug(fmt, ...) 
\ { \ char __fmt[] = fmt; \ bpf_trace_printk(__fmt, sizeof(__fmt), \ ##__VA_ARGS__); \ } SEC("xdp") int xdp_test_prog_with_a_long_name(struct xdp_md *ctx) { bpf_debug("PASS[1]: prog %u\n", ctx->ingress_ifindex); return XDP_PASS; } SEC("xdp") int xdp_test_prog_with_a_long_name_too(struct xdp_md *ctx) { bpf_debug("PASS[2]: prog %u\n", ctx->ingress_ifindex); return XDP_PASS; } struct { __uint(priority, 5); __uint(XDP_PASS, 1); } XDP_RUN_CONFIG(xdp_test_prog_with_a_long_name); char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/lib/testing/test_runner.sh000077500000000000000000000321761514310632100206070ustar00rootroot00000000000000#!/bin/bash # SPDX-License-Identifier: GPL-2.0-or-later # # Script to setup and manage tests for xdp-tools. # Based on the test-env script from xdp-tutorial. # # Author: Toke Høiland-Jørgensen (toke@redhat.com) # Date: 26 May 2020 # Copyright (c) 2020 Red Hat set -o nounset umask 077 TEST_PROG_DIR="${TEST_PROG_DIR:-$(dirname "${BASH_SOURCE[0]}")}" SETUP_SCRIPT="$TEST_PROG_DIR/setup-netns-env.sh" TEST_CONFIG="$TEST_PROG_DIR/test_config.sh" IP6_SUBNET=fc42:dead:cafe # must have exactly three :-separated elements IP6_PREFIX_SIZE=64 # Size of assigned prefixes IP6_FULL_PREFIX_SIZE=48 # Size of IP6_SUBNET IP4_SUBNET=10.11 IP4_PREFIX_SIZE=24 # Size of assigned prefixes IP4_FULL_PREFIX_SIZE=16 # Size of IP4_SUBNET GENERATED_NAME_PREFIX="xdptest" ALL_TESTS="" VERBOSE_TESTS=${V:-0} NUM_NS=2 NEEDED_TOOLS="capinfos ethtool ip ping sed tc tcpdump timeout tshark nft socat ndisc6 arping" if [ -f "$TEST_CONFIG" ]; then source "$TEST_CONFIG" fi if command -v ping6 >/dev/null 2>&1; then PING6=ping6 else PING6=ping fi # Odd return value for skipping, as only 0-255 is valid. 
SKIPPED_TEST=249 # Global state variables that will be set by options etc below STATEDIR= CMD= NS= NS_NAMES=() IP6_PREFIX= IP4_PREFIX= INSIDE_IP6= INSIDE_IP4= INSIDE_MAC= OUTSIDE_IP6= OUTSIDE_IP4= OUTSIDE_MAC= ALL_INSIDE_IP6=() ALL_INSIDE_IP4=() is_trace_attach_supported() { if [[ -z "${TRACE_ATTACH_SUPPORT:-}" ]]; then [ -f "$STATEDIR/trace_attach_support" ] && \ TRACE_ATTACH_SUPPORT=$(< "$STATEDIR/trace_attach_support") if [[ -z "${TRACE_ATTACH_SUPPORT:-}" ]]; then RESULT=$($XDP_LOADER load -v "$NS" "$TEST_PROG_DIR/xdp_pass.o" 2>&1) PID=$(start_background "$XDPDUMP -i $NS") RESULT=$(stop_background "$PID") if [[ "$RESULT" == *"The kernel does not support fentry function attach"* ]]; then TRACE_ATTACH_SUPPORT="false" else TRACE_ATTACH_SUPPORT="true" fi echo "$TRACE_ATTACH_SUPPORT" > "$STATEDIR/trace_attach_support" $XDP_LOADER unload "$NS" --all fi fi if [[ "$TRACE_ATTACH_SUPPORT" == "true" ]]; then return 0 else return 1 fi } is_multiprog_supported() { if [[ -z "${MULTIPROG_SUPPORT:-}" ]]; then RESULT=$($XDP_LOADER load -v "$NS" "$TEST_PROG_DIR/xdp_pass.o" 2>&1) if [[ "$RESULT" == *"Compatibility check for dispatcher program failed"* ]]; then MULTIPROG_SUPPORT="false" else MULTIPROG_SUPPORT="true" fi $XDP_LOADER unload "$NS" --all fi if [[ "$MULTIPROG_SUPPORT" == "true" ]]; then return 0 else return 1 fi } is_progmap_supported() { if [[ -z "${PROGMAP_SUPPORT:-}" ]]; then RESULT=$(timeout -s INT 1 $XDP_BENCH redirect-cpu "$NS" -c 0 -r drop -vv 2>&1) if [[ "$RESULT" == *"Create CPU entry failed: Cannot allocate memory"* ]]; then PROGMAP_SUPPORT="false" else PROGMAP_SUPPORT="true" fi fi if [[ "$PROGMAP_SUPPORT" == "true" ]]; then return 0 else return 1 fi } is_xsk_busy_poll_supported() { $TEST_PROG_DIR/test-tool probe xsk-busy-poll } skip_if_missing_veth_rxq() { if ! ethtool -l $NS >/dev/null 2>&1; then exit "$SKIPPED_TEST" fi } skip_if_missing_cpumap_attach() { if ! 
$TEST_PROG_DIR/test-tool probe cpumap-prog; then exit "$SKIPPED_TEST" fi } skip_if_missing_xdp_load_bytes() { if ! $TEST_PROG_DIR/test-tool probe xdp-load-bytes; then exit "$SKIPPED_TEST" fi } skip_if_missing_kernel_symbol() { if ! grep -q "$1" /proc/kallsyms; then exit "$SKIPPED_TEST" fi } skip_if_legacy_fallback() { if ! is_multiprog_supported; then exit "$SKIPPED_TEST" fi } skip_if_missing_trace_attach() { if ! is_trace_attach_supported; then exit "$SKIPPED_TEST" fi } die() { echo "$1" >&2 exit 1 } mv_tmpfile() { local src="$1" local dst="$2" local MAXWAIT=100 while ! [ -f "$src" ]; do sleep 0.1 MAXWAIT=$[$MAXWAIT - 1] [ "$MAXWAIT" -eq 0 ] && break done mv "$src" "$dst" } start_background() { local TMP_FILE="${STATEDIR}/tmp_proc_$$_$RANDOM" setsid bash -c "$*" &> ${TMP_FILE} & local PID=$! mv_tmpfile "$TMP_FILE" "${STATEDIR}/proc/${PID}" echo "$PID" } start_background_wait_output() { local bg_func="$1" local out_grep="$2" local PID local outfile shift 2 PID=$($bg_func "$@") outfile="${STATEDIR}/proc/${PID}" local MAXWAIT=100 while ! grep -q "$out_grep" $outfile; do echo "Waiting for output '$out_grep' from PID $PID">&2 cat $outfile >&2 sleep 0.1 MAXWAIT=$[$MAXWAIT - 1] [ "$MAXWAIT" -eq 0 ] && break done echo "$PID" } start_tcpdump() { start_background_wait_output start_background "listening on" "$@" } start_background_no_stderr() { local TMP_FILE="${STATEDIR}/tmp_proc_$$_$RANDOM" setsid bash -c "$*" 1> ${TMP_FILE} 2>/dev/null & local PID=$! mv_tmpfile "$TMP_FILE" "${STATEDIR}/proc/${PID}" echo "$PID" } start_background_ns_devnull() { local TMP_FILE="${STATEDIR}/tmp_proc_$$_$RANDOM" setsid ip netns exec "$NS" env TESTENV_NAME="$NS" "$SETUP_SCRIPT" bash -c "$*" 1>/dev/null 2>${TMP_FILE} & local PID=$! 
mv_tmpfile "$TMP_FILE" "${STATEDIR}/proc/${PID}" echo $PID } start_socat_ns() { start_background_wait_output start_background_ns_devnull "listening on" "$@" } kill_process_group() { local PID=$1 kill -SIGINT -$PID for i in $(seq 10); do ps --ppid $PID -p $PID > /dev/null || return 0 sleep 0.1 done kill -TERM -$PID } stop_background() { local PID=$1 local OUTPUT_FILE="${STATEDIR}/proc/${PID}" local pids kill_process_group $PID if [ -f "$OUTPUT_FILE" ]; then cat "$OUTPUT_FILE" rm "$OUTPUT_FILE" >& /dev/null fi } check_prereq() { local max_locked_mem=$(ulimit -l) for t in $NEEDED_TOOLS; do command -v "$t" > /dev/null || die "Missing required tool: $t" done if [ "$EUID" -ne "0" ]; then die "This script needs root permissions to run." fi STATEDIR="$(mktemp -d --tmpdir=${TMPDIR:-/tmp} --suffix=.xdptest)" if [ $? -ne 0 ]; then die "Unable to create state dir in $TMPDIR" fi mkdir ${STATEDIR}/proc if [ "$max_locked_mem" != "unlimited" ]; then ulimit -l unlimited || die "Unable to set ulimit" fi mount -t bpf bpf /sys/fs/bpf/ || die "Unable to mount bpffs" } gen_nsname() { local nsname while nsname=$(printf "%s-%04x" "$GENERATED_NAME_PREFIX" $RANDOM) [ -e "$STATEDIR/${nsname}.ns" ] do true; done touch "$STATEDIR/${nsname}.ns" echo $nsname } iface_macaddr() { local iface="$1" local ns="${2:-}" [ -n "$ns" ] && ns="-n $ns" ip $ns -br link show dev "$iface" | awk '{print $3}' } set_sysctls() { local iface="$1" local in_ns="${2:-}" local nscmd= [ -n "$in_ns" ] && nscmd="ip netns exec $in_ns" local sysctls_off_v6=(accept_dad accept_ra mldv1_unsolicited_report_interval mldv2_unsolicited_report_interval) local sysctls_on=(forwarding) for s in ${sysctls_off_v6[*]}; do $nscmd sysctl -w net.ipv6.conf.$iface.${s}=0 >/dev/null done for s in ${sysctls_on[*]}; do $nscmd sysctl -w net.ipv6.conf.$iface.${s}=1 >/dev/null $nscmd sysctl -w net.ipv6.conf.all.${s}=1 >/dev/null $nscmd sysctl -w net.ipv4.conf.$iface.${s}=1 >/dev/null $nscmd sysctl -w net.ipv4.conf.all.${s}=1 >/dev/null done } 
init_ns() { local nsname=$1 local num=$2 local peername="testl-ve-$num" IP6_PREFIX="${IP6_SUBNET}:${num}::" IP4_PREFIX="${IP4_SUBNET}.$((0x$num))." INSIDE_IP6="${IP6_PREFIX}2" INSIDE_IP4="${IP4_PREFIX}2" OUTSIDE_IP6="${IP6_PREFIX}1" OUTSIDE_IP4="${IP4_PREFIX}1" ip netns add "$nsname" ip link add dev "$nsname" type veth peer name "$peername" set_sysctls $nsname ethtool -K "$nsname" rxvlan off txvlan off gro on ethtool -K "$peername" rxvlan off txvlan off gro on ip link set dev "$peername" multicast off ip link set dev "$nsname" multicast off ip link set dev "$peername" netns "$nsname" ip link set dev "$nsname" up ip addr add dev "$nsname" "${OUTSIDE_IP6}/${IP6_PREFIX_SIZE}" ip -n "$nsname" link set dev "$peername" name veth0 ip -n "$nsname" link set dev lo up ip -n "$nsname" link set dev veth0 up set_sysctls veth0 "$nsname" ip -n "$nsname" addr add dev veth0 "${INSIDE_IP6}/${IP6_PREFIX_SIZE}" OUTSIDE_MAC=$(iface_macaddr "$nsname") INSIDE_MAC=$(iface_macaddr "veth0" "$nsname") # Prevent neighbour queries on the link ip neigh add "$INSIDE_IP6" lladdr "$INSIDE_MAC" dev "$nsname" nud permanent ip -n "$nsname" neigh add "$OUTSIDE_IP6" lladdr "$OUTSIDE_MAC" dev veth0 nud permanent ip addr add dev "$nsname" "${OUTSIDE_IP4}/${IP4_PREFIX_SIZE}" ip -n "$nsname" addr add dev veth0 "${INSIDE_IP4}/${IP4_PREFIX_SIZE}" ip neigh add "$INSIDE_IP4" lladdr "$INSIDE_MAC" dev "$nsname" nud permanent ip -n "$nsname" neigh add "$OUTSIDE_IP4" lladdr "$OUTSIDE_MAC" dev veth0 nud permanent # Add default routes inside the ns ip -n "$nsname" route add default via $OUTSIDE_IP4 dev veth0 ip -n "$nsname" -6 route add default via $OUTSIDE_IP6 dev veth0 ALL_INSIDE_IP4+=($INSIDE_IP4) ALL_INSIDE_IP6+=($INSIDE_IP6) } setup() { local nsname set -o errexit check_prereq for i in $(seq $NUM_NS); do nsname=$(gen_nsname) init_ns $nsname $i NS_NAMES+=($nsname) done set +o errexit NS=$nsname } teardown_ns() { local nsname=$1 ip link del dev "$nsname" ip netns del "$nsname" [ -d "/sys/fs/bpf/$nsname" ] && 
rmdir "/sys/fs/bpf/$nsname" || true } teardown() { for ns in "${NS_NAMES[@]}"; do teardown_ns $ns done [ -d "$STATEDIR" ] || return 0 for f in ${STATEDIR}/proc/*; do if [ -f "$f" ]; then local pid="${f/${STATEDIR}\/proc\//}" stop_background "$pid" &> /dev/null || true fi done rm -rf "$STATEDIR" } ns_exec() { ip netns exec "$NS" env TESTENV_NAME="$NS" "$SETUP_SCRIPT" "$@" } is_func() { type "$1" 2>/dev/null | grep -q 'is a function' } check_run() { local ret "$@" ret=$? echo "Command '$@' exited with status $ret" echo "" if [ "$ret" -ne "0" ]; then exit $ret fi } exec_test() { local testn="$1" local output local ret local prefix local retries=${TEST_RETRIES:-1} prefix=$(printf " %-30s" "[$testn]") if ! is_func "$testn"; then echo "${prefix}INVALID" return 1 fi while [[ "$retries" -gt 0 ]]; do if [ "$VERBOSE_TESTS" -eq "1" ]; then echo "${prefix}START:" ($testn 2>&1) | sed 's/^/ /' ret=${PIPESTATUS[0]} echo " Test $testn exited with return code: $ret" else echo -n "$prefix" output=$($testn 2>&1) ret=$? 
prefix= fi if [ "$ret" -eq "0" ]; then break else retries=$[$retries - 1] [ "$VERBOSE_TESTS" -eq "1" ] && echo " Test failed - retrying $retries more times" if is_func cleanup_tests; then cleanup_tests || true fi fi done if [ "$ret" -eq "0" ]; then echo "${prefix}PASS" elif [ "$ret" -eq "$SKIPPED_TEST" ]; then echo "${prefix}SKIPPED" ret=0 else echo "${prefix}FAIL" fi if [ "$ret" -ne "0" ] && [ "$VERBOSE_TESTS" -ne "1" ]; then echo "$output" | sed 's/^/ /' echo " Test $testn exited with return code: $ret" fi return $ret } run_tests() { local TESTS="$*" local ret=0 [ -z "$TESTS" ] && TESTS="$ALL_TESTS" echo " Running tests from $TEST_DEFINITIONS" for testn in $TESTS; do exec_test $testn || ret=1 if is_func cleanup_tests; then cleanup_tests || true fi done return $ret } usage() { echo "Usage: $0 [test names]" >&2 exit 1 } if [ "$EUID" -ne "0" ]; then if command -v sudo >/dev/null 2>&1; then exec sudo env V=${VERBOSE_TESTS} DEBUG_TESTENV=${DEBUG_TESTENV:-0} "$0" "$@" else die "Tests should be run as root" fi else if [ "${DID_UNSHARE:-0}" -ne "1" ]; then echo " Executing tests in separate net- and mount namespaces" >&2 exec env DID_UNSHARE=1 unshare -n -m "$0" "$@" fi fi export XDPDUMP export XDP_BENCH export XDP_FILTER export XDP_FORWARD export XDP_LOADER export XDP_MONITOR export XDP_TRAFFICGEN TEST_DEFINITIONS="${1:-}" [ -f "$TEST_DEFINITIONS" ] || usage source "$TEST_DEFINITIONS" TOOL_TESTS_DIR="$(dirname "$TEST_DEFINITIONS")" shift trap teardown EXIT setup if [ "${DEBUG_TESTENV:-0}" -eq "1" ] && [ -n "$SHELL" ]; then echo "Entering interactive testenv debug - Ctrl-D to exit and resume test execution" $SHELL fi run_tests "$@" xdp-tools-1.6.1/lib/testing/xdp_adjust_tail.c000066400000000000000000000003441514310632100212120ustar00rootroot00000000000000#include #include SEC("xdp") int xdp_adjust_tail(struct xdp_md *ctx) { if (bpf_xdp_adjust_tail(ctx, -1) < 0) return XDP_ABORTED; return XDP_DROP; } char _license[] SEC("license") = "GPL"; 
xdp-tools-1.6.1/lib/testing/xdp_drop.c000066400000000000000000000002431514310632100176510ustar00rootroot00000000000000#include #include SEC("xdp") int xdp_drop(struct xdp_md *ctx) { return XDP_DROP; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/lib/testing/xdp_pass.c000066400000000000000000000004231514310632100176530ustar00rootroot00000000000000#include #include #include struct { __uint(priority, 10); __uint(XDP_PASS, 1); } XDP_RUN_CONFIG(xdp_pass); SEC("xdp") int xdp_pass(struct xdp_md *ctx) { return XDP_PASS; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/lib/util/000077500000000000000000000000001514310632100151675ustar00rootroot00000000000000xdp-tools-1.6.1/lib/util/Makefile000066400000000000000000000012101514310632100166210ustar00rootroot00000000000000include util.mk LIB_DIR ?= .. include $(LIB_DIR)/defines.mk include $(LIBXDP_DIR)/libxdp.mk all: $(UTIL_OBJS) UTIL_SKEL_H = $(UTIL_BPF_OBJS:.bpf.o=.skel.h) $(UTIL_OBJS): %.o: %.c %.h $(UTIL_SKEL_H) $(LIBMK) $(QUIET_CC)$(CC) $(CFLAGS) $(CPPFLAGS) -Wall -I../../headers -c -o $@ $< clean: $(Q)rm -f $(UTIL_OBJS) $(UTIL_BPF_OBJS) $(UTIL_SKEL_H) *.ll BPF_CFLAGS += -I$(HEADER_DIR) $(ARCH_INCLUDES) $(UTIL_BPF_OBJS): %.o: %.c $(KERN_USER_H) $(BPF_HEADERS) $(LIBMK) $(QUIET_CLANG)$(CLANG) -target $(BPF_TARGET) $(BPF_CFLAGS) -O2 -c -g -o $@ $< $(UTIL_SKEL_H): %.skel.h: %.bpf.o $(QUIET_GEN)$(BPFTOOL) gen skeleton $< name ${@:.skel.h=} > $@ xdp-tools-1.6.1/lib/util/compat.h000066400000000000000000000016551514310632100166320ustar00rootroot00000000000000#ifndef __COMPAT_H #define __COMPAT_H #include #include #ifndef HAVE_LIBBPF_BTF__TYPE_CNT static inline __u32 btf__type_cnt(const struct btf *btf) { /* old function didn't include 'void' type in count */ return btf__get_nr_types(btf) + 1; } #endif #ifndef HAVE_LIBBPF_BPF_PROGRAM__TYPE static inline enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) { return bpf_program__get_type((struct bpf_program *)prog); } #endif #ifndef 
HAVE_LIBBPF_BPF_OBJECT__NEXT_PROGRAM static inline struct bpf_program *bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) { return bpf_program__next(prog, obj); } #endif #ifndef HAVE_LIBBPF_BPF_PROGRAM__EXPECTED_ATTACH_TYPE static inline enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program *prog) { return bpf_program__get_expected_attach_type((struct bpf_program *)prog); } #endif #endif xdp-tools-1.6.1/lib/util/logging.c000066400000000000000000000036561514310632100167730ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include #include "logging.h" #include "util.h" static enum logging_print_level log_level = LOG_INFO; static int print_func(enum logging_print_level level, int indent, const char *format, va_list args) { int i; if (level > log_level) return 0; for (i = 0; i < indent; i++) fprintf(stderr, " "); return vfprintf(stderr, format, args); } static int libbpf_print_func(enum libbpf_print_level level, const char *format, va_list args) { return print_func(level + 1, 2, format, args); } static int libbpf_silent_func(__unused enum libbpf_print_level level, __unused const char *format, __unused va_list args) { return 0; } static int libxdp_print_func(enum libxdp_print_level level, const char *format, va_list args) { return print_func(level + 1, 1, format, args); } static int libxdp_silent_func(__unused enum libxdp_print_level level, __unused const char *format, __unused va_list args) { return 0; } #define __printf(a, b) __attribute__((format(printf, a, b))) __printf(2, 3) void logging_print(enum logging_print_level level, const char *format, ...) 
{ va_list args; va_start(args, format); print_func(level, 0, format, args); va_end(args); } void init_lib_logging(void) { libbpf_set_print(libbpf_print_func); libxdp_set_print(libxdp_print_func); } void silence_libbpf_logging(void) { if (log_level < LOG_VERBOSE) libbpf_set_print(libbpf_silent_func); } void silence_libxdp_logging(void) { if (log_level < LOG_VERBOSE) libxdp_set_print(libxdp_silent_func); } enum logging_print_level set_log_level(enum logging_print_level level) { enum logging_print_level old_level = log_level; log_level = level; return old_level; } enum logging_print_level increase_log_level(void) { if (log_level < LOG_VERBOSE) log_level++; return log_level; } xdp-tools-1.6.1/lib/util/logging.h000066400000000000000000000020321514310632100167630ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LOGGING_H #define __LOGGING_H /* This matches the libbpf logging levels, but with an additional VERBOSE level; * we demote all libbpf messages by one level so debug messages only show up on * VERBOSE. */ enum logging_print_level { LOG_WARN, LOG_INFO, LOG_DEBUG, LOG_VERBOSE, }; extern void logging_print(enum logging_print_level level, const char *format, ...) __attribute__((format(printf, 2, 3))); #define __pr(level, fmt, ...) \ do { \ logging_print(level, fmt, ##__VA_ARGS__); \ } while (0) #define pr_warn(fmt, ...) __pr(LOG_WARN, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) __pr(LOG_INFO, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) 
__pr(LOG_DEBUG, fmt, ##__VA_ARGS__) void init_lib_logging(void); void silence_libbpf_logging(void); void silence_libxdp_logging(void); enum logging_print_level set_log_level(enum logging_print_level level); enum logging_print_level increase_log_level(); #endif xdp-tools-1.6.1/lib/util/params.c000066400000000000000000000431141514310632100166210ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include /* XDP_FLAGS_* depend on kernel-headers installed */ #include #include #include "params.h" #include "logging.h" #include "util.h" #define BUFSIZE 30 #define FIRST_PRINTABLE 65 /* ord('A') = 65 */ #define VERSION_SHORT_OPT 0 static bool opt_needs_arg(const struct prog_option *opt) { return opt->type > OPT_BOOL && !opt->positional; } static bool opt_is_multi(const struct prog_option *opt) { return opt->type == OPT_MULTISTRING || opt->type == OPT_IFNAME_MULTI || opt->type == OPT_U32_MULTI; } static int handle_bool(__unused char *optarg, void *tgt, __unused struct prog_option *opt) { bool *opt_set = tgt; *opt_set = true; return 0; } static int handle_string(char *optarg, void *tgt, __unused struct prog_option *opt) { char **opt_set = tgt; *opt_set = optarg; return 0; } static int handle_multistring(char *optarg, void *tgt, __unused struct prog_option *opt) { struct multistring *opt_set = tgt; void *ptr; if (opt_set->num_strings +1 > SIZE_MAX / sizeof(*opt_set->strings)) return -ENOMEM; ptr = realloc(opt_set->strings, sizeof(*opt_set->strings) * (opt_set->num_strings +1)); if (!ptr) return -errno; opt_set->strings = ptr; opt_set->strings[opt_set->num_strings++] = optarg; return 0; } static int handle_u8(char *optarg, void *tgt, __unused struct prog_option *opt) { __u8 *opt_set = tgt; unsigned long val; errno = 0; val = strtoul(optarg, NULL, opt->hex ? 
16 : 10); if (errno || val > 0xff) return -EINVAL; *opt_set = val; return 0; } static int handle_u16(char *optarg, void *tgt, __unused struct prog_option *opt) { __u16 *opt_set = tgt; unsigned long val; errno = 0; val = strtoul(optarg, NULL, opt->hex ? 16 : 10); if (errno || val > 0xffff) return -EINVAL; *opt_set = val; return 0; } static int handle_u32(char *optarg, void *tgt, __unused struct prog_option *opt) { __u32 *opt_set = tgt; unsigned long val; errno = 0; val = strtoul(optarg, NULL, opt->hex ? 16 : 10); if (errno || val > 0xffffffff) return -EINVAL; *opt_set = val; return 0; } static int handle_u32_multi(char *optarg, void *tgt, struct prog_option *opt) { struct u32_multi *opt_set = tgt; __u32 val; void *ptr; int ret; if (opt_set->num_vals +1 > SIZE_MAX / sizeof(*opt_set->vals)) return -ENOMEM; ret = handle_u32(optarg, &val, opt); if (ret) return ret; ptr = realloc(opt_set->vals, sizeof(*opt_set->vals) * (opt_set->num_vals +1)); if (!ptr) return -errno; opt_set->vals = ptr; opt_set->vals[opt_set->num_vals++] = val; return 0; } static int handle_u64(char *optarg, void *tgt, __unused struct prog_option *opt) { __u64 *opt_set = tgt; unsigned long long val; errno = 0; val = strtoull(optarg, NULL, opt->hex ? 
16 : 10); if (errno) return -EINVAL; *opt_set = val; return 0; } static int parse_mac(char *str, unsigned char mac[ETH_ALEN]) { unsigned int v[ETH_ALEN]; int len, i; /* Based on https://stackoverflow.com/a/20553913 */ len = sscanf(str, "%x:%x:%x:%x:%x:%x%*c", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); if (len != ETH_ALEN) return -EINVAL; for (i = 0; i < ETH_ALEN; i++) { if (v[i] > 0xFF) return -EINVAL; mac[i] = v[i]; } return 0; } static int handle_macaddr(char *optarg, void *tgt, __unused struct prog_option *opt) { struct mac_addr *opt_set = tgt; int err; err = parse_mac(optarg, opt_set->addr); if (err) pr_warn("Invalid MAC address: %s\n", optarg); return err; } void print_macaddr(char *buf, size_t buf_len, const struct mac_addr *addr) { int i, len; for (i = 0; buf_len > 0 && i < ETH_ALEN; i++) { len = snprintf(buf, buf_len, "%02x", addr->addr[i]); if (len < 0 || (size_t)len >= buf_len) break; buf += len; buf_len -= len; if (i < ETH_ALEN - 1) { *buf++ = ':'; buf_len -= 1; } } *buf = '\0'; } bool macaddr_is_null(const struct mac_addr *addr) { static struct mac_addr nulladdr = {}; return memcmp(addr, &nulladdr, sizeof(nulladdr)) == 0; } static const struct flag_val *find_flag(const struct flag_val *flag_vals, const char *chr) { while (flag_vals->flagstring) { if (strcmp(chr, flag_vals->flagstring) == 0) return flag_vals; flag_vals++; } return NULL; } static int handle_flags(char *optarg, void *tgt, struct prog_option *opt) { const struct flag_val *flag, *flag_vals = opt->typearg; unsigned int *opt_set = tgt; unsigned int flagval = 0; char *c = NULL; while (*optarg) { c = strchr(optarg, ','); if (c) *c = '\0'; flag = find_flag(flag_vals, optarg); if (!flag) return -EINVAL; flagval |= flag->flagval; if (!c) break; optarg = c + 1; } *opt_set = flagval; return 0; } static int get_ifindex(const char *ifname) { int ifindex; ifindex = if_nametoindex(ifname); if (!ifindex) { pr_warn("Couldn't find network interface '%s'.\n", ifname); return -ENOENT; } return ifindex; } 
static int handle_ifname(char *optarg, void *tgt, __unused struct prog_option *opt) { struct iface *iface = tgt; int ifindex; ifindex = get_ifindex(optarg); if (ifindex < 0) return ifindex; iface->ifname = optarg; iface->ifindex = ifindex; return 0; } static int handle_ifname_multi(char *optarg, void *tgt, __unused struct prog_option *opt) { struct iface **ifaces = tgt; struct iface *iface, *tmp; int ifindex; ifindex = get_ifindex(optarg); if (ifindex < 0) return ifindex; iface = calloc(1, sizeof(*iface)); if (!iface) return -ENOMEM; iface->ifname = optarg; iface->ifindex = ifindex; if (!*ifaces) { *ifaces = iface; return 0; } tmp = *ifaces; while(tmp->next) tmp = tmp->next; tmp->next = iface; return 0; } void print_addr(char *buf, size_t buf_len, const struct ip_addr *addr) { inet_ntop(addr->af, &addr->addr, buf, buf_len); } bool ipaddr_is_null(const struct ip_addr *addr) { static struct ip_addr nulladdr = {}; return memcmp(addr, &nulladdr, sizeof(nulladdr)) == 0; } static int handle_ipaddr(char *optarg, void *tgt, __unused struct prog_option *opt) { struct ip_addr *addr = tgt; int af; af = strchr(optarg, ':') ? 
AF_INET6 : AF_INET; if (inet_pton(af, optarg, &addr->addr) != 1) { pr_warn("Invalid IP address: %s\n", optarg); return -ENOENT; /* caller won't print error on ENOENT */ } addr->af = af; return 0; } static const struct enum_val *find_enum(const struct enum_val *enum_vals, const char *chr) { while (enum_vals->name) { if (strcmp(chr, enum_vals->name) == 0) return enum_vals; enum_vals++; } return NULL; } static int handle_enum(char *optarg, void *tgt, struct prog_option *opt) { const struct enum_val *val, *all_vals = opt->typearg; unsigned int *opt_set = tgt; val = find_enum(all_vals, optarg); if (!val) return -EINVAL; *opt_set = val->value; return 0; } static void print_enum_vals(char *buf, size_t buf_len, const struct enum_val *vals) { const struct enum_val *val; bool first = true; for (val = vals; buf_len && val->name; val++) { int len; if (!first) { *buf++ = ','; buf_len--; } first = false; len = snprintf(buf, buf_len, "%s", val->name); if (len < 0 || (size_t)len >= buf_len) break; buf += len; buf_len -= len; } *buf = '\0'; } const char *get_enum_name(const struct enum_val *vals, unsigned int value) { const struct enum_val *val; for (val = vals; val->name; val++) if (val->value == value) return val->name; return NULL; } static const struct opthandler { int (*func)(char *optarg, void *tgt, struct prog_option *opt); } handlers[__OPT_MAX] = { {NULL}, {handle_bool}, {handle_flags}, {handle_string}, {handle_u8}, {handle_u16}, {handle_u32}, {handle_u32_multi}, {handle_u64}, {handle_macaddr}, {handle_ifname}, {handle_ifname_multi}, {handle_ipaddr}, {handle_enum}, {handle_multistring} }; void print_flags(char *buf, size_t buf_len, const struct flag_val *flags, unsigned long flags_set) { const struct flag_val *flag; bool first = true; for (flag = flags; buf_len && flag->flagstring; flag++) { int len; if (!(flag->flagval & flags_set)) continue; if (!first) { *buf++ = ','; buf_len--; } first = false; len = snprintf(buf, buf_len, "%s", flag->flagstring); if (len < 0 || 
(size_t)len >= buf_len) break; buf += len; buf_len -= len; } *buf = '\0'; } static void print_help_flags(const struct prog_option *opt) { char buf[100] = {}; if (!opt->typearg) pr_warn("Missing typearg for opt %s\n", opt->name); else print_flags(buf, sizeof(buf), opt->typearg, -1); printf(" %s (valid values: %s)", opt->help, buf); } static void print_help_enum(const struct prog_option *opt) { char buf[100] = {}; if (!opt->typearg) pr_warn("Missing typearg for opt %s\n", opt->name); else print_enum_vals(buf, sizeof(buf), opt->typearg); printf(" %s (valid values: %s)", opt->help, buf); } static const struct helprinter { void (*func)(const struct prog_option *opt); } help_printers[__OPT_MAX] = { {NULL}, {NULL}, {print_help_flags}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {NULL}, {print_help_enum}, {NULL} }; static void _print_positional(const struct prog_option *long_options) { const struct prog_option *opt; FOR_EACH_OPTION (long_options, opt) { if (!opt->positional) continue; printf(" %s", opt->metavar ?: opt->name); } } static void _print_options(const struct prog_option *poptions, bool required) { const struct prog_option *opt; FOR_EACH_OPTION (poptions, opt) { if (opt->required != required || opt->hidden) continue; if (opt->positional) { printf(" %-30s", opt->metavar ?: opt->name); } else { char buf[BUFSIZE]; int pos; if (opt->short_opt >= FIRST_PRINTABLE) printf(" -%c,", opt->short_opt); else printf(" "); pos = snprintf(buf, BUFSIZE, " --%s", opt->name); if (pos < 0 || pos >= BUFSIZE) { pr_warn("opt name too long: %s\n", opt->name); continue; } if (opt->metavar) snprintf(&buf[pos], BUFSIZE - pos, " %s", opt->metavar); printf("%-28s", buf); } if (help_printers[opt->type].func != NULL) help_printers[opt->type].func(opt); else if (opt->help) printf(" %s", opt->help); printf("\n"); } } bool is_prefix(const char *pfx, const char *str) { if (!pfx) return false; if (strlen(str) < strlen(pfx)) return false; return !memcmp(str, pfx, 
strlen(pfx)); } void usage(const char *prog_name, const char *doc, const struct prog_option *poptions, bool full) { const struct prog_option *opt; int num_req = 0; printf("\nUsage: %s [options]", prog_name); _print_positional(poptions); printf("\n"); if (!full) { printf("Use --help (or -h) to see full option list.\n"); return; } FOR_EACH_OPTION (poptions, opt) if (opt->required) num_req++; printf("\n %s\n\n", doc); if (num_req) { printf("Required parameters:\n"); _print_options(poptions, true); printf("\n"); } printf("Options:\n"); _print_options(poptions, false); printf(" -v, --verbose Enable verbose logging (-vv: more verbose)\n"); printf(" --version Display version information\n"); printf(" -h, --help Show this help\n"); printf("\n"); } static int prog_options_to_options(struct prog_option *poptions, struct option **options, char **optstring) { int num = 0, num_cmn = 0, n_sopt = VERSION_SHORT_OPT + 1; struct option *new_options, *nopt; struct prog_option *opt; char buf[100], *c = buf, *i; struct option common_opts[] = { {"help", no_argument, NULL, 'h'}, {"verbose", no_argument, NULL, 'v'}, {"version", no_argument, NULL, VERSION_SHORT_OPT}, {} }; for (nopt = common_opts; nopt->name; nopt++) { num++; num_cmn++; if (nopt->val != VERSION_SHORT_OPT) *c++ = nopt->val; } FOR_EACH_OPTION (poptions, opt) if (!opt->positional) num++; new_options = calloc(num + 1, sizeof(struct option)); if (!new_options) return -ENOMEM; memcpy(new_options, &common_opts, sizeof(struct option) * num_cmn); nopt = new_options + num_cmn; FOR_EACH_OPTION (poptions, opt) { if (opt->positional) continue; if (opt->short_opt) { for (i = buf; i < c; i++) { if (*i == opt->short_opt) { pr_warn("Duplicate option char: %c\n", opt->short_opt); goto err; } } *(c++) = opt->short_opt; if (opt_needs_arg(opt)) *(c++) = ':'; } else { /* getopt expects options to have unique values in the * 'val' field, however we want to be able to define * options that don't have a short opt. 
So get around * that, just number such options sequentially. */ if (n_sopt >= FIRST_PRINTABLE) { pr_warn("Too many options with no short opt\n"); goto err; } opt->short_opt = n_sopt++; } nopt->has_arg = opt_needs_arg(opt) ? required_argument : no_argument; nopt->name = opt->name; nopt->val = opt->short_opt; nopt->flag = NULL; nopt++; } *(c++) = '\0'; *optstring = strdup(buf); if (!*optstring) goto err; /* Make sure we clear the last option, or else we crash. */ memset(new_options + num, 0, sizeof(struct option)); *options = new_options; return 0; err: free(new_options); return -EINVAL; } static struct prog_option *find_opt(struct prog_option *all_opts, int optchar) { struct prog_option *opt; FOR_EACH_OPTION (all_opts, opt) if (opt->short_opt == optchar) return opt; return NULL; } static int _set_opt(void *cfg, struct prog_option *opt, char *optarg) { int ret; if (opt->max_num && opt->num_set + 1 > opt->max_num) { pr_warn("Too many parameters for %s (max %u)\n", opt->metavar ?: opt->name, opt->max_num); return -E2BIG; } ret = handlers[opt->type].func(optarg, (cfg + opt->cfg_offset), opt); if (!ret) opt->num_set++; else if (ret != -ENOENT) pr_warn("Couldn't parse option %s: %s.\n", opt->name, strerror(-ret)); return ret; } static int set_opt(void *cfg, struct prog_option *all_opts, int optchar, char *optarg) { struct prog_option *opt; if (!cfg) return -EFAULT; opt = find_opt(all_opts, optchar); if (!opt) return -ENOENT; return _set_opt(cfg, opt, optarg); } static int set_pos_opt(void *cfg, struct prog_option *all_opts, char *optarg) { struct prog_option *o, *opt = NULL; FOR_EACH_OPTION (all_opts, o) { if (o->positional && (!o->num_set || opt_is_multi(o))) { opt = o; break; } } if (!opt) return -ENOENT; return _set_opt(cfg, opt, optarg); } int parse_cmdline_args(int argc, char **argv, struct prog_option *poptions, void *cfg, size_t cfg_size, const char *prog, const char *usage_cmd, const char *doc, const void *defaults) { struct prog_option *opt_iter; struct option 
*long_options; bool full_help = false; int i, opt, err = 0; int longindex = 0; char *optstring; if (prog_options_to_options(poptions, &long_options, &optstring)) { pr_warn("Error preparing options\n"); return -ENOMEM; } if (defaults) memcpy(cfg, defaults, cfg_size); /* Parse commands line args */ while ((opt = getopt_long(argc, argv, optstring, long_options, &longindex)) != -1) { switch (opt) { case 'h': usage(usage_cmd, doc, poptions, true); err = EXIT_FAILURE; goto out; case 'v': increase_log_level(); break; case VERSION_SHORT_OPT: printf("%s version %s using libbpf version %s\n", prog, TOOLS_VERSION, get_libbpf_version()); err = EXIT_FAILURE; goto out; default: if (set_opt(cfg, poptions, opt, optarg)) { usage(usage_cmd, doc, poptions, full_help); err = EXIT_FAILURE; goto out; } break; } } for (i = optind; i < argc; i++) { if (set_pos_opt(cfg, poptions, argv[i])) { usage(usage_cmd, doc, poptions, full_help); err = EXIT_FAILURE; goto out; } } FOR_EACH_OPTION (poptions, opt_iter) { if (opt_iter->num_set && (!opt_iter->min_num || opt_iter->num_set >= opt_iter->min_num)) continue; if (opt_iter->required) { if (opt_iter->positional) pr_warn("Missing required parameter %s\n", opt_iter->metavar ?: opt_iter->name); else pr_warn("Missing required option '--%s'\n", opt_iter->name); usage(usage_cmd, doc, poptions, full_help); err = EXIT_FAILURE; goto out; } } out: free(long_options); free(optstring); return err; } int dispatch_commands(const char *argv0, int argc, char **argv, const struct prog_command *cmds, size_t cfg_size, const char *prog_name, bool needs_bpffs) { const struct prog_command *c, *cmd = NULL; int ret = EXIT_FAILURE, err, len; char pin_root_path[PATH_MAX]; char usagebuf[100]; int candidates = 0; void *cfg; for (c = cmds; c->name; c++) { if (is_prefix(argv0, c->name)) { cmd = c; candidates++; if (!strcmp(argv0, c->name)) break; } } if (!cmd || (candidates > 1 && strcmp(argv0, cmd->name))) { pr_warn("Command '%s' is %s, try '%s help'.\n", argv0, cmd ? 
"ambiguous" : "unknown", prog_name); return EXIT_FAILURE; } if (cmd->no_cfg) return cmd->func(NULL, NULL); cfg = calloc(1, cfg_size); if (!cfg) { pr_warn("Couldn't allocate memory\n"); return EXIT_FAILURE; } len = snprintf(usagebuf, sizeof(usagebuf), "%s %s", prog_name, cmd->name); if (len < 0 || (size_t)len >= sizeof(usagebuf)) goto out; err = parse_cmdline_args(argc, argv, cmd->options, cfg, cfg_size, prog_name, usagebuf, cmd->doc, cmd->default_cfg); if (err) goto out; err = get_bpf_root_dir(pin_root_path, sizeof(pin_root_path), prog_name, needs_bpffs); if (err && needs_bpffs) goto out; err = check_bpf_environ(); if (err) goto out; ret = cmd->func(cfg, pin_root_path); out: free(cfg); return ret; } xdp-tools-1.6.1/lib/util/params.h000066400000000000000000000072451514310632100166330ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PARAMS_H #define __PARAMS_H #include #include #include #include #include #include #include enum option_type { OPT_NONE, OPT_BOOL, OPT_FLAGS, OPT_STRING, OPT_U8, OPT_U16, OPT_U32, OPT_U32_MULTI, OPT_U64, OPT_MACADDR, OPT_IFNAME, OPT_IFNAME_MULTI, OPT_IPADDR, OPT_ENUM, OPT_MULTISTRING, __OPT_MAX }; struct prog_option { enum option_type type; size_t cfg_size; size_t cfg_offset; size_t opt_size; char *name; char short_opt; char *help; char *metavar; void *typearg; bool required; bool positional; bool hidden; bool hex; unsigned int min_num; unsigned int max_num; unsigned int num_set; }; struct flag_val { const char *flagstring; unsigned int flagval; }; struct enum_val { const char *name; unsigned int value; }; struct multistring { const char **strings; size_t num_strings; }; struct u32_multi { __u32 *vals; size_t num_vals; }; struct iface { struct iface *next; char *ifname; int ifindex; }; struct ip_addr { int af; union { struct in_addr addr4; struct in6_addr addr6; } addr; }; struct mac_addr { unsigned char addr[ETH_ALEN]; }; #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) #define 
DEFINE_OPTION(_name, _type, _cfgtype, _cfgmember, ...) \ { \ .cfg_size = sizeof(_cfgtype), \ .opt_size = sizeof_field(_cfgtype, _cfgmember), \ .cfg_offset = offsetof(_cfgtype, _cfgmember), .name = _name, \ .type = _type, __VA_ARGS__ \ } #define END_OPTIONS \ { \ } #define FOR_EACH_OPTION(_options, _opt) \ for (_opt = _options; _opt->type != OPT_NONE; _opt++) struct prog_command { const char *name; int (*func)(const void *cfg, const char *pin_root_path); struct prog_option *options; const void *default_cfg; char *doc; bool no_cfg; }; #define DEFINE_COMMAND_NAME(_name, _func, _doc) \ { \ .name = _name, .func = do_##_func, \ .options = _func##_options, .default_cfg = &defaults_##_func, \ .doc = _doc \ } #define DEFINE_COMMAND(_name, _doc) DEFINE_COMMAND_NAME(textify(_name), _name, _doc) #define DEFINE_COMMAND_NODEF(_name, _doc) \ { \ .name = textify(_name), .func = do_##_name, \ .options = _name##_options, .doc = _doc \ } #define END_COMMANDS \ { \ } const char *get_enum_name(const struct enum_val *vals, unsigned int value); void print_flags(char *buf, size_t buf_len, const struct flag_val *flags, unsigned long flags_val); void print_addr(char *buf, size_t buf_len, const struct ip_addr *addr); void print_macaddr(char *buf, size_t buf_len, const struct mac_addr *addr); bool macaddr_is_null(const struct mac_addr *addr); bool ipaddr_is_null(const struct ip_addr *addr); bool is_prefix(const char *prefix, const char *string); void usage(const char *prog_name, const char *doc, const struct prog_option *long_options, bool full); int parse_cmdline_args(int argc, char **argv, struct prog_option *long_options, void *cfg, size_t cfg_size, const char *prog, const char *usage_cmd, const char *doc, const void *defaults); int dispatch_commands(const char *argv0, int argc, char **argv, const struct prog_command *cmds, size_t cfg_size, const char *prog_name, bool needs_bpffs); #endif /* __COMMON_PARAMS_H */ 
xdp-tools-1.6.1/lib/util/stats.c000066400000000000000000000140261514310632100164740ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include #include #include #include #include #include #include #include "stats.h" #include "util.h" #include "logging.h" #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ static int gettime(__u64 *nstime) { struct timespec t; int res; res = clock_gettime(CLOCK_MONOTONIC, &t); if (res < 0) { pr_warn("Error with gettimeofday! (%i)\n", res); return res; } *nstime = (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; return 0; } static double calc_period(struct record *r, struct record *p) { double period_ = 0; __u64 period = 0; period = r->timestamp - p->timestamp; if (period > 0) period_ = ((double)period / NANOSEC_PER_SEC); return period_; } int stats_print_one(struct stats_record *stats_rec) { __u64 packets, bytes; struct record *rec; int i, err; /* Print for each XDP actions stats */ for (i = 0; i < XDP_ACTION_MAX; i++) { char *fmt = " %-35s %'11lld pkts %'11lld KiB\n"; const char *action = action2str(i); rec = &stats_rec->stats[i]; packets = rec->total.rx_packets; bytes = rec->total.rx_bytes; if (rec->enabled) { err = printf(fmt, action, packets, bytes / 1024); if (err < 0) return err; } } return 0; } int stats_print(struct stats_record *stats_rec, struct stats_record *stats_prev) { struct record *rec, *prev; __u64 packets, bytes; struct timespec t; bool first = true; double period; double pps; /* packets per sec */ double bps; /* bits per sec */ int i, err; err = clock_gettime(CLOCK_REALTIME, &t); if (err < 0) { pr_warn("Error with gettimeofday! 
(%i)\n", err); return err; } /* Print for each XDP actions stats */ for (i = 0; i < XDP_ACTION_MAX; i++) { char *fmt = "%-12s %'11lld pkts (%'10.0f pps)" " %'11lld KiB (%'6.0f Mbits/s)\n"; const char *action = action2str(i); rec = &stats_rec->stats[i]; prev = &stats_prev->stats[i]; if (!rec->enabled) continue; packets = rec->total.rx_packets - prev->total.rx_packets; bytes = rec->total.rx_bytes - prev->total.rx_bytes; period = calc_period(rec, prev); if (period == 0) return 0; if (first) { printf("Period of %fs ending at %ld.%06ld\n", period, (long) t.tv_sec, (long) t.tv_nsec / 1000); first = false; } pps = packets / period; bps = (bytes * 8) / period / 1000000; printf(fmt, action, rec->total.rx_packets, pps, rec->total.rx_bytes / 1024, bps, period); } printf("\n"); return 0; } /* BPF_MAP_TYPE_ARRAY */ static int map_get_value_array(int fd, __u32 key, struct xdp_stats_record *value) { int err = 0; err = bpf_map_lookup_elem(fd, &key, value); if (err) pr_debug("bpf_map_lookup_elem failed key:0x%X\n", key); return err; } /* BPF_MAP_TYPE_PERCPU_ARRAY */ static int map_get_value_percpu_array(int fd, __u32 key, struct xdp_stats_record *value) { /* For percpu maps, userspace gets a value per possible CPU */ int nr_cpus = libbpf_num_possible_cpus(); struct xdp_stats_record *values; __u64 sum_bytes = 0; __u64 sum_pkts = 0; int i, err; if (nr_cpus < 0) return nr_cpus; values = calloc(nr_cpus, sizeof(*values)); if (!values) return -ENOMEM; err = bpf_map_lookup_elem(fd, &key, values); if (err) { pr_debug("bpf_map_lookup_elem failed key:0x%X\n", key); goto out; } /* Sum values from each CPU */ for (i = 0; i < nr_cpus; i++) { sum_pkts += values[i].rx_packets; sum_bytes += values[i].rx_bytes; } value->rx_packets = sum_pkts; value->rx_bytes = sum_bytes; out: free(values); return err; } static int map_collect(int fd, __u32 map_type, __u32 key, struct record *rec) { struct xdp_stats_record value = {}; int err; /* Get time as close as possible to reading map contents */ err = 
gettime(&rec->timestamp); if (err) return err; switch (map_type) { case BPF_MAP_TYPE_ARRAY: err = map_get_value_array(fd, key, &value); break; case BPF_MAP_TYPE_PERCPU_ARRAY: err = map_get_value_percpu_array(fd, key, &value); break; default: pr_warn("Unknown map_type: %u cannot handle\n", map_type); err = -EINVAL; break; } if (err) return err; rec->total.rx_packets = value.rx_packets; rec->total.rx_bytes = value.rx_bytes; return 0; } int stats_collect(int map_fd, __u32 map_type, struct stats_record *stats_rec) { /* Collect all XDP actions stats */ __u32 key; int err; for (key = 0; key < XDP_ACTION_MAX; key++) { if (!stats_rec->stats[key].enabled) continue; err = map_collect(map_fd, map_type, key, &stats_rec->stats[key]); if (err) return err; } return 0; } static int check_map_pin(__u32 map_id, const char *pin_dir, const char *map_name) { struct bpf_map_info info = {}; int fd, ret = 0; fd = get_pinned_map_fd(pin_dir, map_name, &info); if (fd < 0) { if (fd == -ENOENT) pr_warn("Stats map disappeared while polling\n"); else pr_warn("Unable to re-open stats map\n"); return fd; } if (info.id != map_id) { pr_warn("Stats map ID changed while polling\n"); ret = -EINVAL; } close(fd); return ret; } int stats_poll(int map_fd, int interval, bool *exit, const char *pin_dir, const char *map_name) { struct bpf_map_info info = {}; struct stats_record prev, record = { 0 }; __u32 info_len = sizeof(info); __u32 map_type, map_id; int err; record.stats[XDP_DROP].enabled = true; record.stats[XDP_PASS].enabled = true; record.stats[XDP_REDIRECT].enabled = true; record.stats[XDP_TX].enabled = true; if (!interval) return -EINVAL; err = bpf_obj_get_info_by_fd(map_fd, &info, &info_len); if (err) return -errno; map_type = info.type; map_id = info.id; /* Get initial reading quickly */ stats_collect(map_fd, map_type, &record); usleep(1000000 / 4); while (!*exit) { if (pin_dir) { err = check_map_pin(map_id, pin_dir, map_name); if (err) return err; } memset(&info, 0, sizeof(info)); prev = record; 
/* struct copy */ stats_collect(map_fd, map_type, &record); err = stats_print(&record, &prev); if (err) return err; usleep(interval * 1000); } return 0; } xdp-tools-1.6.1/lib/util/stats.h000066400000000000000000000012061514310632100164750ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __STATS_H #define __STATS_H #include #include "xdp/xdp_stats_kern_user.h" struct record { __u64 timestamp; bool enabled; struct xdp_stats_record total; /* defined in common_kern_user.h */ }; struct stats_record { struct record stats[XDP_ACTION_MAX]; }; int stats_print_one(struct stats_record *stats_rec); int stats_print(struct stats_record *stats_rec, struct stats_record *stats_prev); int stats_collect(int map_fd, __u32 map_type, struct stats_record *stats_rec); int stats_poll(int map_fd, int interval, bool *exit, const char *pin_dir, const char *map_name); #endif xdp-tools-1.6.1/lib/util/util.c000066400000000000000000000465171514310632100163250ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include /* Need XDP flags */ #include /* BPF FS magic */ #include /* ERR_PTR */ #include #include #include #include "util.h" #include "logging.h" static struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {"unspecified", XDP_MODE_UNSPEC}, {NULL, 0} }; int try_snprintf(char *buf, size_t buf_len, const char *format, ...) 
{ va_list args; int len; va_start(args, format); len = vsnprintf(buf, buf_len, format, args); va_end(args); if (len < 0) return -EINVAL; else if ((size_t)len >= buf_len) return -ENAMETOOLONG; return 0; } static int set_rlimit(unsigned int min_limit) { struct rlimit limit; int err = 0; err = getrlimit(RLIMIT_MEMLOCK, &limit); if (err) { err = -errno; pr_warn("Couldn't get current rlimit\n"); return err; } if (limit.rlim_cur == RLIM_INFINITY || limit.rlim_cur == 0) { pr_debug("Current rlimit is infinity or 0. Not raising\n"); return -ENOMEM; } if (min_limit) { if (limit.rlim_cur >= min_limit) { pr_debug("Current rlimit %ju already >= minimum %u\n", (uintmax_t)limit.rlim_cur, min_limit); return 0; } pr_debug("Setting rlimit to minimum %u\n", min_limit); limit.rlim_cur = min_limit; } else { pr_debug("Doubling current rlimit of %ju\n", (uintmax_t)limit.rlim_cur); limit.rlim_cur <<= 1; } limit.rlim_max = max(limit.rlim_cur, limit.rlim_max); err = setrlimit(RLIMIT_MEMLOCK, &limit); if (err) { err = -errno; pr_warn("Couldn't raise rlimit: %s\n", strerror(-err)); return err; } return 0; } int double_rlimit(void) { pr_debug("Permission denied when loading eBPF object; " "raising rlimit and retrying\n"); return set_rlimit(0); } static const char *_libbpf_compile_version = LIBBPF_VERSION; static char _libbpf_version[10] = {}; const char *get_libbpf_version(void) { /* Start by copying compile-time version into buffer so we have a * fallback value in case we are dynamically linked, or can't find a * version in /proc/self/maps below. */ strncpy(_libbpf_version, _libbpf_compile_version, sizeof(_libbpf_version)-1); #ifdef LIBBPF_DYNAMIC char path[PATH_MAX], buf[PATH_MAX], *s; bool found = false; FILE *fp; /* When dynamically linking against libbpf, we can't be sure that the * version we discovered at compile time is actually the one we are * using at runtime. This can lead to hard-to-debug errors, so we try to * discover the correct version at runtime. 
* * The simple solution to this would be if libbpf itself exported a * version in its API. But since it doesn't, we work around this by * parsing the mappings of the binary at runtime, looking for the full * filename of libbpf.so and using that. */ fp = fopen("/proc/self/maps", "r"); if (fp == NULL) goto out; while ((s = fgets(buf, sizeof(buf), fp)) != NULL) { /* We are looking for a line like: * 7f63c2105000-7f63c2106000 rw-p 00032000 fe:02 4200947 /usr/lib/libbpf.so.0.1.0 */ if (sscanf(s, "%*x-%*x %*4c %*x %*5c %*d %s\n", path) == 1 && (s = strstr(path, "libbpf.so.")) != NULL) { strncpy(_libbpf_version, s+10, sizeof(_libbpf_version)-1); found = true; break; } } fclose(fp); out: if (!found) pr_warn("Couldn't find runtime libbpf version - falling back to compile-time value!\n"); #endif _libbpf_version[sizeof(_libbpf_version)-1] = '\0'; return _libbpf_version; } static bool try_bpf_file(char *buf, size_t buf_size, char *path, const char *progname) { struct stat sb = {}; if (try_snprintf(buf, buf_size, "%s/%s", path, progname)) return false; pr_debug("Looking for '%s'\n", buf); if (stat(buf, &sb)) return false; return true; } int find_bpf_file(char *buf, size_t buf_size, const char *progname) { static char *bpf_obj_paths[] = { #ifdef DEBUG ".", #endif BPF_OBJECT_PATH, NULL }; char *path, **p; path = secure_getenv(XDP_OBJECT_ENVVAR); if (path && try_bpf_file(buf, buf_size, path, progname)) { return 0; } else if (!path) { for (p = bpf_obj_paths; *p; p++) if (try_bpf_file(buf, buf_size, *p, progname)) return 0; } pr_warn("Couldn't find a BPF file with name %s\n", progname); return -ENOENT; } struct bpf_object *open_bpf_file(const char *progname, struct bpf_object_open_opts *opts) { char buf[PATH_MAX]; int err; err = find_bpf_file(buf, sizeof(buf), progname); if (err) return ERR_PTR(err); pr_debug("Loading bpf file '%s' from '%s'\n", progname, buf); return bpf_object__open_file(buf, opts); } static int get_pinned_object_fd(const char *path, void *info, __u32 *info_len) { 
char errmsg[STRERR_BUFSIZE]; int pin_fd, err; pin_fd = bpf_obj_get(path); if (pin_fd < 0) { err = -errno; libbpf_strerror(-err, errmsg, sizeof(errmsg)); pr_debug("Couldn't retrieve pinned object '%s': %s\n", path, errmsg); return err; } if (info) { err = bpf_obj_get_info_by_fd(pin_fd, info, info_len); if (err) { err = -errno; libbpf_strerror(-err, errmsg, sizeof(errmsg)); pr_debug("Couldn't retrieve object info: %s\n", errmsg); return err; } } return pin_fd; } int make_dir_subdir(const char *parent, const char *dir) { char path[PATH_MAX]; int err; err = try_snprintf(path, sizeof(path), "%s/%s", parent, dir); if (err) return err; err = mkdir(parent, S_IRWXU); if (err && errno != EEXIST) { err = -errno; return err; } err = mkdir(path, S_IRWXU); if (err && errno != EEXIST) { err = -errno; return err; } return 0; } int attach_xdp_program(struct xdp_program *prog, const struct iface *iface, enum xdp_attach_mode mode, const char *pin_root_path) { char pin_path[PATH_MAX]; int err = 0; if (!prog || !pin_root_path) return -EINVAL; err = make_dir_subdir(pin_root_path, "programs"); if (err) { pr_warn("Unable to create pin directory: %s\n", strerror(-err)); return err; } err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s/%s", pin_root_path, iface->ifname, xdp_program__name(prog)); if (err) return err; err = xdp_program__attach(prog, iface->ifindex, mode, 0); if (err) { if (pin_root_path && err != -EEXIST) unlink(pin_path); return err; } pr_debug("Program '%s' loaded on interface '%s'%s\n", xdp_program__name(prog), iface->ifname, mode == XDP_MODE_SKB ? 
" in skb mode" : ""); err = xdp_program__pin(prog, pin_path); if (err) { pr_warn("Unable to pin XDP program at %s: %s\n", pin_path, strerror(-err)); goto unload; } pr_debug("XDP program pinned at %s\n", pin_path); return err; unload: xdp_program__detach(prog, iface->ifindex, mode, 0); return err; } int detach_xdp_program(struct xdp_program *prog, const struct iface *iface, enum xdp_attach_mode mode, const char *pin_root_path) { char pin_path[PATH_MAX]; int err; err = xdp_program__detach(prog, iface->ifindex, mode, 0); if (err) goto out; err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s/%s", pin_root_path, iface->ifname, xdp_program__name(prog)); if (err) return err; err = unlink(pin_path); if (err && errno != ENOENT) goto out; err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s", pin_root_path, iface->ifname); if (err) goto out; err = rmdir(pin_path); if (err && errno == ENOENT) err = 0; else if (err) err = -errno; out: return err; } int get_pinned_program(const struct iface *iface, const char *pin_root_path, enum xdp_attach_mode *mode, struct xdp_program **xdp_prog) { int ret = -ENOENT, err, ifindex = iface->ifindex; char pin_path[PATH_MAX]; bool remove_all = false; enum xdp_attach_mode m; struct dirent *de; DIR *dr; err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s", pin_root_path, iface->ifname); if (err) return err; dr = opendir(pin_path); if (!dr) { err = -errno; pr_debug("Couldn't open pin directory %s: %s\n", pin_path, strerror(-err)); return err; } if (!ifindex) ifindex = if_nametoindex(iface->ifname); if (!ifindex) { pr_debug("Interface %s no longer exists\n", iface->ifname); remove_all = true; ret = -ENODEV; } while ((de = readdir(dr)) != NULL) { DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, 0); struct xdp_program *prog; if (!strcmp(".", de->d_name) || !strcmp("..", de->d_name)) continue; err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s/%s", pin_root_path, iface->ifname, de->d_name); if (err) goto out; if 
/* Iterate over all XDP programs pinned under
 * <pin_root_path>/programs/<ifname>/ and invoke @cb once per interface that
 * still has a valid pinned program.
 *
 * Stale entries are garbage-collected as a side effect: when
 * get_pinned_program() reports the program gone (-ENOENT) or the interface
 * gone (-ENODEV), the per-interface subdirectory is removed instead of
 * invoking the callback.
 *
 * Returns 0 on success, -ENOENT if the 'programs' directory can't be
 * opened, the first non-zero value returned by @cb, or another negative
 * error code.
 */
int iterate_pinned_programs(const char *pin_root_path, program_callback cb,
			    void *arg)
{
	char pin_path[PATH_MAX];
	struct dirent *de;
	int err = 0;
	DIR *dr;

	err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs",
			   pin_root_path);
	if (err)
		return err;

	dr = opendir(pin_path);
	if (!dr)
		return -ENOENT;

	while ((de = readdir(dr)) != NULL) {
		enum xdp_attach_mode mode = XDP_MODE_UNSPEC;
		struct xdp_program *prog = NULL;
		struct iface iface = {};

		if (!strcmp(".", de->d_name) || !strcmp("..", de->d_name))
			continue;

		/* Each subdirectory is named after its interface */
		iface.ifname = de->d_name;
		iface.ifindex = if_nametoindex(iface.ifname);

		err = try_snprintf(pin_path, sizeof(pin_path), "%s/programs/%s",
				   pin_root_path, iface.ifname);
		if (err)
			goto out;

		err = get_pinned_program(&iface, pin_root_path, &mode, &prog);
		if (err == -ENOENT || err == -ENODEV) {
			/* Program or interface gone; prune the stale dir */
			err = rmdir(pin_path);
			if (err)
				goto out;
			continue;
		} else if (err) {
			goto out;
		}

		/* Callback owns nothing; we close the program handle here */
		err = cb(&iface, prog, mode, arg);
		xdp_program__close(prog);
		if (err)
			goto out;
	}
out:
	closedir(dr);
	return err;
}
= indexes; idx->if_index; idx++) { struct xdp_multiprog *mp; struct iface iface = { .ifindex = idx->if_index, .ifname = idx->if_name, }; mp = xdp_multiprog__get_from_ifindex(iface.ifindex); if (IS_ERR_OR_NULL(mp)) { if (PTR_ERR(mp) != -ENOENT) { err = PTR_ERR(mp); pr_warn("Error getting XDP status for interface %s: %s\n", idx->if_name, strerror(-err)); goto out; } mp = NULL; } err = cb(&iface, mp, arg); xdp_multiprog__close(mp); if (err) goto out; } out: if_freenameindex(indexes); return err; } static bool bpf_is_valid_mntpt(const char *mnt, unsigned long magic) { struct statfs st_fs; if (statfs(mnt, &st_fs) < 0) return false; if ((unsigned long)st_fs.f_type != magic) return false; return true; } static const char *bpf_find_mntpt_single(unsigned long magic, char *mnt, int len, const char *mntpt) { if (bpf_is_valid_mntpt(mntpt, magic)) { strncpy(mnt, mntpt, len - 1); mnt[len - 1] = '\0'; return mnt; } return NULL; } static const char *bpf_find_mntpt(const char *fstype, unsigned long magic, char *mnt, int len, const char * const *known_mnts) { const char * const *ptr; char type[100]; FILE *fp; if (known_mnts) { ptr = known_mnts; while (*ptr) { if (bpf_find_mntpt_single(magic, mnt, len, *ptr)) return mnt; ptr++; } } if (len != PATH_MAX) return NULL; fp = fopen("/proc/mounts", "r"); if (fp == NULL) return NULL; while (fscanf(fp, "%*s %" textify(PATH_MAX) "s %99s %*s %*d %*d\n", mnt, type) == 2) { if (strcmp(type, fstype) == 0) break; } fclose(fp); if (strcmp(type, fstype) != 0) return NULL; return mnt; } static int bpf_mnt_check_target(const char *target) { int ret; ret = mkdir(target, S_IRWXU); if (ret && errno != EEXIST) { ret = -errno; pr_warn("mkdir %s failed: %s\n", target, strerror(-ret)); return ret; } return 0; } /* simplified version of code from iproute2 */ static const char *bpf_get_work_dir() { static char bpf_tmp[PATH_MAX] = BPF_DIR_MNT; static char bpf_wrk_dir[PATH_MAX]; static const char *mnt; static bool bpf_mnt_cached; static const char *const 
/* Unlink the pinned map @map_name relative to the directory fd @dir_fd.
 * A map that is not pinned at all is not treated as an error.
 *
 * Returns 0 on success (or if the pin doesn't exist), negative errno on
 * failure.
 */
int unlink_pinned_map(int dir_fd, const char *map_name)
{
	struct stat statbuf = {};
	int err;

	err = fstatat(dir_fd, map_name, &statbuf, 0);
	if (err && errno == ENOENT) {
		pr_debug("Map name %s not pinned\n", map_name);
		return 0;
	} else if (err) {
		err = -errno;
		pr_warn("Couldn't stat pinned map %s: %s\n", map_name,
			strerror(-err));
		return err;
	}

	pr_debug("Unlinking pinned map %s\n", map_name);
	err = unlinkat(dir_fd, map_name, 0);
	if (err) {
		err = -errno;
		pr_warn("Couldn't unlink pinned map %s: %s\n", map_name,
			strerror(-err));
		/* Fix: return the saved error code; re-reading errno here
		 * could pick up a value clobbered by pr_warn()/strerror().
		 */
		return err;
	}
	return 0;
}
/* Render a BPF program tag as a NUL-terminated lowercase hex string into
 * @buf (which must hold BPF_TAG_SIZE * 2 + 1 bytes). Returns @buf for
 * convenient use inside printf() argument lists.
 */
static char *print_bpf_tag(char buf[BPF_TAG_SIZE * 2 + 1],
			   const unsigned char tag[BPF_TAG_SIZE])
{
	char *pos = buf;
	int idx;

	for (idx = 0; idx < BPF_TAG_SIZE; idx++) {
		/* Each byte becomes two hex digits plus a temporary NUL
		 * that the next iteration overwrites.
		 */
		snprintf(pos, 3, "%02x", tag[idx]);
		pos += 2;
	}
	*pos = '\0';
	return buf;
}
(hw_prog) { printf("%-16s %-5s %-17s %-8s %-4d %-17s\n", iface->ifname, "", xdp_program__name(hw_prog), get_enum_name(xdp_modes, XDP_MODE_HW), xdp_program__id(hw_prog), print_bpf_tag(tag, xdp_program__tag(hw_prog))); } dispatcher = xdp_multiprog__main_prog(mp); if (dispatcher) { printf("%-16s %-5s %-17s %-8s %-4d %-17s\n", iface->ifname, "", xdp_program__name(dispatcher), get_enum_name(xdp_modes, xdp_multiprog__attach_mode(mp)), xdp_program__id(dispatcher), print_bpf_tag(tag, xdp_program__tag(dispatcher))); for (prog = xdp_multiprog__next_prog(NULL, mp); prog; prog = xdp_multiprog__next_prog(prog, mp)) { err = xdp_program__print_chain_call_actions(prog, buf, sizeof(buf)); if (err) return err; printf("%-16s %-5d %-16s %-8s %-4u %-17s %s\n", " =>", xdp_program__run_prio(prog), xdp_program__name(prog), "", xdp_program__id(prog), print_bpf_tag(tag, xdp_program__tag(prog)), buf); } } return 0; } int iface_print_status(const struct iface *iface) { int err = 0; printf("%-16s %-5s %-17s Mode ID %-17s %s\n", "Interface", "Prio", "Program name", "Tag", "Chain actions"); printf("--------------------------------------------------------------------------------------\n"); if (iface) { struct xdp_multiprog *mp; mp = xdp_multiprog__get_from_ifindex(iface->ifindex); if (IS_ERR_OR_NULL(mp)) { if (PTR_ERR(mp) != -ENOENT) { err = PTR_ERR(mp); pr_warn("Error getting XDP status for interface %s: %s\n", iface->ifname, strerror(-err)); goto out; } mp = NULL; } print_iface_status(iface, mp, NULL); } else { err = iterate_iface_multiprogs(print_iface_status, NULL); } printf("\n"); out: return err; } #ifdef HAVE_LIBBPF_BPF_XDP_QUERY int iface_get_xdp_feature_flags(int ifindex, __u64 *feature_flags) { LIBBPF_OPTS(bpf_xdp_query_opts, opts); int err; err = bpf_xdp_query(ifindex, 0, &opts); if (err) return err; *feature_flags = opts.feature_flags; return 0; } #else int iface_get_xdp_feature_flags(__unused int ifindex, __unused __u64 *feature_flags) { return -EOPNOTSUPP; } #endif 
xdp-tools-1.6.1/lib/util/util.h000066400000000000000000000062031514310632100163160ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UTIL_H #define __UTIL_H #include #include #include "params.h" #ifndef PATH_MAX #define PATH_MAX 4096 #endif #define STRERR_BUFSIZE 1024 #define _textify(x) #x #define textify(x) _textify(x) #define __unused __attribute__((unused)) #ifndef BPF_DIR_MNT #define BPF_DIR_MNT "/sys/fs/bpf" #endif #ifndef BPF_OBJECT_PATH #define BPF_OBJECT_PATH "/usr/lib/bpf" #endif #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) #define FOR_EACH_MAP_KEY(_err, _map_fd, _map_key, _prev_key) \ for (_err = bpf_map_get_next_key(_map_fd, NULL, &_map_key); \ !_err; \ _prev_key = _map_key, \ _err = bpf_map_get_next_key(_map_fd, &_prev_key, &_map_key)) #define min(x, y) ((x) < (y) ? x : y) #define max(x, y) ((x) > (y) ? x : y) #ifndef offsetof #define offsetof(type, member) ((size_t) & ((type *)0)->member) #endif #ifndef container_of #define container_of(ptr, type, member) \ ({ \ const typeof(((type *)0)->member) *__mptr = (ptr); \ (type *)((char *)__mptr - offsetof(type, member)); \ }) #endif #ifndef roundup #define roundup(x, y) \ ({ \ typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ }) #endif int try_snprintf(char *buf, size_t buf_len, const char *format, ...); int make_dir_subdir(const char *parent, const char *dir); int check_bpf_environ(void); int double_rlimit(void); int attach_xdp_program(struct xdp_program *prog, const struct iface *iface, enum xdp_attach_mode mode, const char *pin_root_dir); int detach_xdp_program(struct xdp_program *prog, const struct iface *iface, enum xdp_attach_mode mode, const char *pin_root_dir); int find_bpf_file(char *buf, size_t buf_size, const char *progname); struct bpf_object *open_bpf_file(const char *progname, struct bpf_object_open_opts *opts); typedef int (*program_callback)(const struct iface *iface, struct xdp_program *prog, enum xdp_attach_mode mode, void *arg); typedef int 
(*multiprog_callback)(const struct iface *iface, const struct xdp_multiprog *mp, void *arg); int get_pinned_program(const struct iface *iface, const char *pin_root_path, enum xdp_attach_mode *mode, struct xdp_program **prog); int iterate_pinned_programs(const char *pin_root_path, program_callback cb, void *arg); int iterate_iface_multiprogs(multiprog_callback cb, void *arg); int get_bpf_root_dir(char *buf, size_t buf_len, const char *subdir, bool fatal); int get_pinned_map_fd(const char *bpf_root, const char *map_name, struct bpf_map_info *info); int unlink_pinned_map(int dir_fd, const char *map_name); const char *action2str(__u32 action); int prog_lock_acquire(const char *directory); int prog_lock_release(int lock_fd); const char *get_libbpf_version(void); int iface_print_status(const struct iface *iface); int iface_get_xdp_feature_flags(int ifindex, __u64 *feature_flags); #endif xdp-tools-1.6.1/lib/util/util.mk000066400000000000000000000003741514310632100165010ustar00rootroot00000000000000LIB_DIR ?= .. 
/* Feature-probe program: it does nothing useful with traffic, it only
 * exercises the bpf_xdp_load_bytes() helper. If the kernel verifier
 * accepts this object at load time, the helper is supported (this is how
 * the userspace probe that loads this skeleton uses it).
 */
SEC("xdp")
int xdp_probe_prog(struct xdp_md *ctx)
{
	__u8 buf[10];
	int err;

	/* Attempt to copy the first 10 packet bytes; rejected at load time
	 * by kernels that lack the helper.
	 */
	err = bpf_xdp_load_bytes(ctx, 0, buf, sizeof(buf));
	if (err)
		return XDP_ABORTED;
	return XDP_PASS;
}
#include #include #include #include #include #include #include #include #include #include #include "xdp_sample.h" #include "logging.h" #include "xdp_sample.skel.h" #include "xdp_load_bytes.skel.h" #define __sample_print(fmt, cond, ...) \ ({ \ if (cond) \ printf(fmt, ##__VA_ARGS__); \ }) #define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__) #define print_default(fmt, ...) \ __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__) #define __print_err(err, fmt, ...) \ ({ \ __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT, \ ##__VA_ARGS__); \ sample_err_exp = sample_err_exp ? true : err > 0; \ }) #define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__) #define __COLUMN(x) "%'10" x " %-13s" #define FMT_COLUMNf __COLUMN(".0f") #define FMT_COLUMNd __COLUMN("d") #define FMT_COLUMNl __COLUMN(PRIu64) #define RX(rx) rx, "rx/s" #define PPS(pps) pps, "pkt/s" #define DROP(drop) drop, "drop/s" #define ERR(err) err, "error/s" #define HITS(hits) hits, "hit/s" #define XMIT(xmit) xmit, "xmit/s" #define PASS(pass) pass, "pass/s" #define REDIR(redir) redir, "redir/s" #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ #define XDP_UNKNOWN (XDP_REDIRECT + 1) #define XDP_ACTION_MAX (XDP_UNKNOWN + 1) #define XDP_REDIRECT_ERR_MAX 7 enum map_type { MAP_RX, MAP_RXQ, MAP_REDIRECT_ERR, MAP_CPUMAP_ENQUEUE, MAP_CPUMAP_KTHREAD, MAP_EXCEPTION, MAP_DEVMAP_XMIT, MAP_DEVMAP_XMIT_MULTI, NUM_MAP, }; enum log_level { LL_DEFAULT = 1U << 0, LL_SIMPLE = 1U << 1, LL_DEBUG = 1U << 2, }; struct record { __u64 timestamp; struct datarec total; union { struct datarec *cpu; struct datarec *rxq; }; }; struct map_entry { struct hlist_node node; __u64 pair; struct record val; }; struct stats_record { struct record rx_cnt; struct record rxq_cnt; struct record redir_err[XDP_REDIRECT_ERR_MAX]; struct record kthread; struct record exception[XDP_ACTION_MAX]; struct record devmap_xmit; DECLARE_HASHTABLE(xmit_map, 5); struct record enq[]; }; struct sample_output { struct { uint64_t 
rx; uint64_t redir; uint64_t drop; uint64_t drop_xmit; uint64_t err; uint64_t err_pps; uint64_t xmit; } totals; struct { union { uint64_t pps; uint64_t num; }; uint64_t drop; uint64_t err; } rx_cnt; struct { uint64_t suc; uint64_t err; } redir_cnt; struct { uint64_t hits; } except_cnt; struct { uint64_t pps; uint64_t drop; uint64_t err; double bavg; } xmit_cnt; }; struct datarec *sample_mmap[NUM_MAP]; struct bpf_map *sample_map[NUM_MAP]; size_t sample_map_count[NUM_MAP]; enum log_level sample_log_level; struct sample_output sample_out; unsigned long sample_interval; __u64 sample_start_time; bool sample_err_exp; int sample_xdp_cnt; int sample_n_cpus; int sample_n_rxqs; int sample_sig_fd; int sample_mask; int ifindex[2]; static struct { bool checked; bool compat; } sample_compat[SAMPLE_COMPAT_MAX] = {}; bool sample_is_compat(enum sample_compat compat_value) { return sample_compat[compat_value].compat; } bool sample_probe_cpumap_compat(void) { struct xdp_sample *skel; bool res; skel = xdp_sample__open_and_load(); res = !!skel; xdp_sample__destroy(skel); return res; } bool sample_probe_xdp_load_bytes(void) { struct xdp_load_bytes *skel; bool res; skel = xdp_load_bytes__open_and_load(); res = !!skel; xdp_load_bytes__destroy(skel); return res; } void sample_check_cpumap_compat(struct bpf_program *prog, struct bpf_program *prog_compat) { bool res = sample_compat[SAMPLE_COMPAT_CPUMAP_KTHREAD].compat; if (!sample_compat[SAMPLE_COMPAT_CPUMAP_KTHREAD].checked) { res = sample_probe_cpumap_compat(); sample_compat[SAMPLE_COMPAT_CPUMAP_KTHREAD].checked = true; sample_compat[SAMPLE_COMPAT_CPUMAP_KTHREAD].compat = res; } if (res) { pr_debug("Kernel supports 5-arg xdp_cpumap_kthread tracepoint\n"); bpf_program__set_autoload(prog_compat, false); } else { pr_debug("Kernel does not support 5-arg xdp_cpumap_kthread tracepoint, using compat version\n"); bpf_program__set_autoload(prog, false); } } static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = { /* Key=1 keeps unknown 
errors */ "Success", "Unknown", "EINVAL", "ENETDOWN", "EMSGSIZE", "EOPNOTSUPP", "ENOSPC", }; static const char *xdp_action_names[XDP_ACTION_MAX] = { [XDP_ABORTED] = "XDP_ABORTED", [XDP_DROP] = "XDP_DROP", [XDP_PASS] = "XDP_PASS", [XDP_TX] = "XDP_TX", [XDP_REDIRECT] = "XDP_REDIRECT", [XDP_UNKNOWN] = "XDP_UNKNOWN", }; static __u64 gettime(void) { struct timespec t; int res; res = clock_gettime(CLOCK_MONOTONIC, &t); if (res < 0) { pr_warn("Error with gettimeofday! (%i)\n", res); return UINT64_MAX; } return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; } static const char *xdp_action2str(int action) { if (action < XDP_ACTION_MAX) return xdp_action_names[action]; return NULL; } static struct datarec *alloc_records(int nr_entries) { struct datarec *array; if (nr_entries <= 0) return NULL; array = calloc(nr_entries, sizeof(*array)); if (!array) { pr_warn("Failed to allocate memory (nr_entries: %u)\n", nr_entries); return NULL; } return array; } static int map_entry_init(struct map_entry *e, __u64 pair) { e->pair = pair; INIT_HLIST_NODE(&e->node); e->val.timestamp = gettime(); e->val.cpu = alloc_records(libbpf_num_possible_cpus()); if (!e->val.cpu) return -ENOMEM; return 0; } static void map_collect_rxqs(struct datarec *values, struct record *rec) { int i; /* Get time as close as possible to reading map contents */ rec->timestamp = gettime(); /* Record and sum values from each RXQ */ for (i = 0; i < sample_n_rxqs; i++) { pr_debug("%d: %lx %lx\n", i, (unsigned long)&rec->rxq[i], (unsigned long)&values[i]); rec->rxq[i].processed = READ_ONCE(values[i].processed); rec->rxq[i].dropped = READ_ONCE(values[i].dropped); rec->rxq[i].issue = READ_ONCE(values[i].issue); rec->rxq[i].xdp_pass = READ_ONCE(values[i].xdp_pass); rec->rxq[i].xdp_drop = READ_ONCE(values[i].xdp_drop); rec->rxq[i].xdp_redirect = READ_ONCE(values[i].xdp_redirect); } } static void map_collect_percpu(struct datarec *values, struct record *rec) { /* For percpu maps, userspace gets a value per possible CPU */ int 
/* Collect per-CPU datarec values from the devmap xmit_multi hash map using
 * batched lookups (in chunks of up to 32 keys), folding each key's values
 * into a matching entry of rec->xmit_map (entries are created lazily on
 * first sight of a key). Per-CPU summing is delegated to
 * map_collect_percpu().
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 *
 * NOTE(review): a bpf_map_lookup_batch() failure other than ENOENT breaks
 * the loop but is still reported as success — presumably intentional
 * best-effort stats collection; confirm with callers before changing.
 */
static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec)
{
	int nr_cpus = libbpf_num_possible_cpus();
	int i, ret, count = 32;
	struct datarec *values;
	bool init = false;
	__u32 batch;
	__u64 *keys;

	keys = calloc(count, sizeof(__u64));
	if (!keys)
		return -ENOMEM;
	/* One datarec per possible CPU for each of the (up to) count keys */
	values = calloc(count * nr_cpus, sizeof(struct datarec));
	if (!values) {
		free(keys);
		return -ENOMEM;
	}

	for (;;) {
		bool exit = false;

		/* in_batch is NULL on the first call to start iteration;
		 * ENOENT from the kernel signals the final (possibly
		 * partial) batch.
		 */
		ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL,
					   &batch, keys, values,
					   (__u32 *)&count, NULL);
		if (ret < 0 && errno != ENOENT)
			break;
		if (errno == ENOENT)
			exit = true;

		init = true;
		for (i = 0; i < count; i++) {
			struct map_entry *e, *x = NULL;
			__u64 pair = keys[i];
			struct datarec *arr;

			arr = &values[i * nr_cpus];
			/* Look for an existing entry for this key */
			hash_for_each_possible(rec->xmit_map, e, node, pair) {
				if (e->pair == pair) {
					x = e;
					break;
				}
			}
			if (!x) {
				/* First time we see this key: create and
				 * register a new entry.
				 */
				x = calloc(1, sizeof(*x));
				if (!x)
					goto cleanup;
				if (map_entry_init(x, pair) < 0) {
					free(x);
					goto cleanup;
				}
				hash_add(rec->xmit_map, &x->node, pair);
			}
			map_collect_percpu(arr, &x->val);
		}

		if (exit)
			break;
		/* The kernel may shrink count on a partial batch; reset it
		 * before requesting the next one.
		 */
		count = 32;
	}

	free(values);
	free(keys);
	return 0;

cleanup:
	free(values);
	free(keys);
	return -ENOMEM;
}
(sample_mask & SAMPLE_EXCEPTION_CNT) { for (i = 0; i < XDP_ACTION_MAX; i++) { rec->exception[i].cpu = alloc_records(libbpf_num_possible_cpus()); if (!rec->exception[i].cpu) { pr_warn("Failed to allocate exception per-CPU array for \"%s\" case\n", xdp_action2str(i)); while (i--) free(rec->exception[i].cpu); goto end_kthread; } } } if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) { rec->devmap_xmit.cpu = alloc_records(libbpf_num_possible_cpus()); if (!rec->devmap_xmit.cpu) { pr_warn("Failed to allocate devmap_xmit per-CPU array\n"); goto end_exception; } } if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) hash_init(rec->xmit_map); if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { for (i = 0; i < sample_n_cpus; i++) { rec->enq[i].cpu = alloc_records(libbpf_num_possible_cpus()); if (!rec->enq[i].cpu) { pr_warn("Failed to allocate enqueue per-CPU array for CPU %d\n", i); while (i--) free(rec->enq[i].cpu); goto end_devmap_xmit; } } } return rec; end_devmap_xmit: free(rec->devmap_xmit.cpu); end_exception: for (i = 0; i < XDP_ACTION_MAX; i++) free(rec->exception[i].cpu); end_kthread: free(rec->kthread.cpu); end_redir: for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) free(rec->redir_err[i].cpu); end_rxq_cnt: free(rec->rxq_cnt.rxq); end_rx_cnt: free(rec->rx_cnt.cpu); end_rec: free(rec); return NULL; } static void free_stats_record(struct stats_record *r) { struct hlist_node *tmp; struct map_entry *e; unsigned int bkt; int i; for (i = 0; i < sample_n_cpus; i++) free(r->enq[i].cpu); hash_for_each_safe(r->xmit_map, bkt, tmp, e, node) { hash_del(&e->node); free(e->val.cpu); free(e); } free(r->devmap_xmit.cpu); for (i = 0; i < XDP_ACTION_MAX; i++) free(r->exception[i].cpu); free(r->kthread.cpu); for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) free(r->redir_err[i].cpu); free(r->rx_cnt.cpu); free(r); } static double calc_period(struct record *r, struct record *p) { double period_ = 0; __u64 period = 0; period = r->timestamp - p->timestamp; if (period > 0) period_ = ((double)period / NANOSEC_PER_SEC); 
return period_; } static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; __u64 pps = 0; if (period_ > 0) { packets = r->processed - p->processed; pps = round(packets / period_); } return pps; } static __u64 calc_pkts(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; if (period_ > 0) { packets = r->processed - p->processed; } return packets; } static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; __u64 pps = 0; if (period_ > 0) { packets = r->dropped - p->dropped; pps = round(packets / period_); } return pps; } static __u64 calc_drop_pkts(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; if (period_ > 0) { packets = r->dropped - p->dropped; } return packets; } static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; __u64 pps = 0; if (period_ > 0) { packets = r->issue - p->issue; pps = round(packets / period_); } return pps; } static __u64 calc_errs_pkts(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; if (period_ > 0) { packets = r->issue - p->issue; } return packets; } static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_) { __u64 packets = 0; __u64 pps = 0; if (period_ > 0) { packets = r->info - p->info; pps = round(packets / period_); } return pps; } static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass, double *xdp_drop, double *xdp_redirect, double period_) { *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; if (period_ > 0) { *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; } } static void stats_get_rx_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { struct record *rec, *prev; double t, pps, drop, err; int i; rec = 
&stats_rec->rx_cnt; prev = &stats_prev->rx_cnt; t = calc_period(rec, prev); for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; pps = calc_pps(r, p, t); drop = calc_drop_pps(r, p, t); err = calc_errs_pps(r, p, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", str, PPS(pps), DROP(drop), ERR(err)); } if (out) { err = calc_errs_pps(&rec->total, &prev->total, t); out->rx_cnt.pps = calc_pps(&rec->total, &prev->total, t); out->rx_cnt.drop = calc_drop_pps(&rec->total, &prev->total, t); out->rx_cnt.err = err; out->totals.rx += calc_pkts(&rec->total, &prev->total, t); out->totals.drop += calc_drop_pkts(&rec->total, &prev->total, t); out->totals.err += calc_errs_pkts(&rec->total, &prev->total, t); out->totals.err_pps += err; } } static void stats_get_rxq_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev) { struct record *rec, *prev; double t, pps, drop, err; int i; rec = &stats_rec->rxq_cnt; prev = &stats_prev->rxq_cnt; t = calc_period(rec, prev); print_default("\n"); for (i = 0; i < sample_n_rxqs; i++) { struct datarec *r = &rec->rxq[i]; struct datarec *p = &prev->rxq[i]; char str[64]; pps = calc_pps(r, p, t); drop = calc_drop_pps(r, p, t); err = calc_errs_pps(r, p, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "rxq:%d", i); print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", str, PPS(pps), DROP(drop), ERR(err)); } } static void stats_get_cpumap_enqueue(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus) { struct record *rec, *prev; double t, pps, drop, err; int i, to_cpu; /* cpumap enqueue stats */ for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) { rec = &stats_rec->enq[to_cpu]; prev = &stats_prev->enq[to_cpu]; t = calc_period(rec, prev); pps = calc_pps(&rec->total, &prev->total, t); drop = calc_drop_pps(&rec->total, &prev->total, t); 
err = calc_errs_pps(&rec->total, &prev->total, t); if (pps > 0 || drop > 0) { char str[64]; snprintf(str, sizeof(str), "enqueue to cpu %d", to_cpu); if (err > 0) err = pps / err; /* calc average bulk size */ print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN( ".2f") "\n", str, PPS(pps), DROP(drop), err, "bulk-avg"); } for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; pps = calc_pps(r, p, t); drop = calc_drop_pps(r, p, t); err = calc_errs_pps(r, p, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "cpu:%d->%d", i, to_cpu); if (err > 0) err = pps / err; /* calc average bulk size */ print_default( " %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN( ".2f") "\n", str, PPS(pps), DROP(drop), err, "bulk-avg"); } } } static void stats_get_cpumap_remote(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus) { double xdp_pass, xdp_drop, xdp_redirect; struct record *rec, *prev; double t; int i; rec = &stats_rec->kthread; prev = &stats_prev->kthread; t = calc_period(rec, prev); calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, &xdp_redirect, t); if (xdp_pass || xdp_drop || xdp_redirect) { print_err(xdp_drop, " %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", "xdp_stats", PASS(xdp_pass), DROP(xdp_drop), REDIR(xdp_redirect)); } for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, &xdp_redirect, t); if (!xdp_pass && !xdp_drop && !xdp_redirect) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", str, PASS(xdp_pass), DROP(xdp_drop), REDIR(xdp_redirect)); } } static void stats_get_cpumap_kthread(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus) { struct record *rec, *prev; double t, pps, drop, err; int i; rec = &stats_rec->kthread; prev = &stats_prev->kthread; t = 
calc_period(rec, prev); pps = calc_pps(&rec->total, &prev->total, t); drop = calc_drop_pps(&rec->total, &prev->total, t); err = calc_errs_pps(&rec->total, &prev->total, t); print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", pps ? "kthread total" : "kthread", PPS(pps), DROP(drop), err, "sched"); for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; pps = calc_pps(r, p, t); drop = calc_drop_pps(r, p, t); err = calc_errs_pps(r, p, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", str, PPS(pps), DROP(drop), err, "sched"); } } static void stats_get_redirect_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { struct record *rec, *prev; double t, pps; int i; rec = &stats_rec->redir_err[0]; prev = &stats_prev->redir_err[0]; t = calc_period(rec, prev); for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; pps = calc_pps(r, p, t); if (!pps) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-18s " FMT_COLUMNf "\n", str, REDIR(pps)); } if (out) { out->redir_cnt.suc = calc_pps(&rec->total, &prev->total, t); out->totals.redir += calc_pkts(&rec->total, &prev->total, t); } } static void stats_get_redirect_err_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { double t, drop, sum_pps = 0, sum_pkts = 0; struct record *rec, *prev; int rec_i, i; for (rec_i = 1; rec_i < XDP_REDIRECT_ERR_MAX; rec_i++) { char str[64]; rec = &stats_rec->redir_err[rec_i]; prev = &stats_prev->redir_err[rec_i]; t = calc_period(rec, prev); drop = calc_drop_pps(&rec->total, &prev->total, t); if (drop > 0 && !out) { snprintf(str, sizeof(str), sample_log_level & LL_DEFAULT ? 
"%s total" : "%s", xdp_redirect_err_names[rec_i]); print_err(drop, " %-18s " FMT_COLUMNf "\n", str, ERR(drop)); } for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; double drop; drop = calc_drop_pps(r, p, t); if (!drop) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-16s" FMT_COLUMNf "\n", str, ERR(drop)); } sum_pps += drop; sum_pkts += calc_drop_pkts(&rec->total, &prev->total, t); } if (out) { out->redir_cnt.err = sum_pps; out->totals.err += sum_pkts; out->totals.err_pps += sum_pps; } } static void stats_get_exception_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { double t, drop, sum_pps = 0, sum_pkts = 0; struct record *rec, *prev; int rec_i, i; for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { rec = &stats_rec->exception[rec_i]; prev = &stats_prev->exception[rec_i]; t = calc_period(rec, prev); drop = calc_drop_pps(&rec->total, &prev->total, t); sum_pps += drop; sum_pkts += calc_drop_pkts(&rec->total, &prev->total, t); /* Fold out errors after heading */ if (drop > 0 && !out) { print_always(" %-18s " FMT_COLUMNf "\n", xdp_action2str(rec_i), ERR(drop)); for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = &prev->cpu[i]; char str[64]; double drop; drop = calc_drop_pps(r, p, t); if (!drop) continue; snprintf(str, sizeof(str), "cpu:%d", i); print_default(" %-16s" FMT_COLUMNf "\n", str, ERR(drop)); } } } if (out) { out->except_cnt.hits = sum_pps; out->totals.err += sum_pkts; out->totals.err_pps += sum_pps; } } static void stats_get_devmap_xmit(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { double pps, drop, info, err; struct record *rec, *prev; double t; int i; rec = &stats_rec->devmap_xmit; prev = &stats_prev->devmap_xmit; t = calc_period(rec, prev); for (i = 0; i < nr_cpus; i++) { struct datarec *r = &rec->cpu[i]; struct datarec *p = 
&prev->cpu[i]; char str[64]; pps = calc_pps(r, p, t); drop = calc_drop_pps(r, p, t); err = calc_errs_pps(r, p, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "cpu:%d", i); info = calc_info_pps(r, p, t); if (info > 0) info = (pps + drop) / info; /* calc avg bulk */ print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), err, "drv_err/s", info, "bulk-avg"); } if (out) { pps = calc_pps(&rec->total, &prev->total, t); drop = calc_drop_pps(&rec->total, &prev->total, t); err = calc_errs_pps(&rec->total, &prev->total, t); info = calc_info_pps(&rec->total, &prev->total, t); if (info > 0) out->xmit_cnt.bavg = (pps + drop) / info; /* calc avg bulk */ out->xmit_cnt.pps = pps; out->xmit_cnt.drop = drop; out->xmit_cnt.err = err; out->totals.xmit += calc_pkts(&rec->total, &prev->total, t); out->totals.drop_xmit += calc_drop_pkts(&rec->total, &prev->total, t);; out->totals.err += calc_errs_pkts(&rec->total, &prev->total, t);; out->totals.err_pps += err; } } static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec, struct stats_record *stats_prev, int nr_cpus, struct sample_output *out) { double pps, drop, info, err; struct map_entry *entry; struct record *r, *p; unsigned int bkt; double t; hash_for_each(stats_rec->xmit_map, bkt, entry, node) { struct map_entry *e, *x = NULL; char ifname_from[IFNAMSIZ]; char ifname_to[IFNAMSIZ]; const char *fstr, *tstr; unsigned long prev_time; struct record beg = {}; __u32 from_idx, to_idx; char str[128]; __u64 pair; int i; prev_time = sample_interval * NANOSEC_PER_SEC; pair = entry->pair; from_idx = pair >> 32; to_idx = pair & 0xFFFFFFFF; r = &entry->val; beg.timestamp = r->timestamp - prev_time; /* Find matching entry from stats_prev map */ hash_for_each_possible(stats_prev->xmit_map, e, node, pair) { if (e->pair == pair) { x = e; break; } } if (x) p = &x->val; else p = &beg; t = calc_period(r, p); pps = calc_pps(&r->total, &p->total, t); drop = 
calc_drop_pps(&r->total, &p->total, t); info = calc_info_pps(&r->total, &p->total, t); if (info > 0) info = (pps + drop) / info; /* calc avg bulk */ err = calc_errs_pps(&r->total, &p->total, t); if (out) { out->xmit_cnt.pps += pps; out->xmit_cnt.drop += drop; out->xmit_cnt.err += err; /* We are responsible for filling out totals */ out->totals.xmit += calc_pkts(&r->total, &p->total, t); out->totals.drop_xmit += calc_drop_pkts(&r->total, &p->total, t); out->totals.err += calc_errs_pkts(&r->total, &p->total, t); out->totals.err_pps += calc_errs_pps(&r->total, &p->total, t); continue; } fstr = tstr = NULL; if (if_indextoname(from_idx, ifname_from)) fstr = ifname_from; if (if_indextoname(to_idx, ifname_to)) tstr = ifname_to; snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?", tstr ?: "?"); /* Skip idle streams of redirection */ if (pps || drop || err) { print_err(drop * !(sample_mask & SAMPLE_DROP_OK), " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), err, "drv_err/s", info, "bulk-avg"); } for (i = 0; i < nr_cpus; i++) { struct datarec *rc = &r->cpu[i]; struct datarec *pc, p_beg = {}; char str[64]; pc = p == &beg ? 
&p_beg : &p->cpu[i]; pps = calc_pps(rc, pc, t); drop = calc_drop_pps(rc, pc, t); err = calc_errs_pps(rc, pc, t); if (!pps && !drop && !err) continue; snprintf(str, sizeof(str), "cpu:%d", i); info = calc_info_pps(rc, pc, t); if (info > 0) info = (pps + drop) / info; /* calc avg bulk */ print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), err, "drv_err/s", info, "bulk-avg"); } } } static void stats_print(const char *prefix, int mask, struct stats_record *r, struct stats_record *p, struct sample_output *out) { int nr_cpus = libbpf_num_possible_cpus(); const char *str; print_always("%-23s", prefix ?: "Summary"); if (mask & SAMPLE_RX_CNT) print_always(FMT_COLUMNl, RX(out->rx_cnt.pps)); if (mask & SAMPLE_REDIRECT_CNT) print_always(FMT_COLUMNl, REDIR(out->redir_cnt.suc)); printf(FMT_COLUMNl, out->totals.err_pps + ((out->rx_cnt.drop + out->xmit_cnt.drop) * !(mask & SAMPLE_DROP_OK)), (mask & SAMPLE_DROP_OK) ? "err/s" : "err,drop/s"); if (mask & SAMPLE_DEVMAP_XMIT_CNT || mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) printf(FMT_COLUMNl, XMIT(out->xmit_cnt.pps)); printf("\n"); if (mask & SAMPLE_RX_CNT) { str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ? "receive total" : "receive"; print_err((out->rx_cnt.err || (out->rx_cnt.drop && !(mask & SAMPLE_DROP_OK))), " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n", str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop), ERR(out->rx_cnt.err)); stats_get_rx_cnt(r, p, nr_cpus, NULL); } if (mask & SAMPLE_RXQ_STATS) stats_get_rxq_cnt(r, p); if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) stats_get_cpumap_enqueue(r, p, nr_cpus); if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { stats_get_cpumap_kthread(r, p, nr_cpus); stats_get_cpumap_remote(r, p, nr_cpus); } if (mask & SAMPLE_REDIRECT_CNT) { str = out->redir_cnt.suc ? 
"redirect total" : "redirect"; print_default(" %-20s " FMT_COLUMNl "\n", str, REDIR(out->redir_cnt.suc)); stats_get_redirect_cnt(r, p, nr_cpus, NULL); } if (mask & SAMPLE_REDIRECT_ERR_CNT) { str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ? "redirect_err total" : "redirect_err"; print_err(out->redir_cnt.err, " %-20s " FMT_COLUMNl "\n", str, ERR(out->redir_cnt.err)); stats_get_redirect_err_cnt(r, p, nr_cpus, NULL); } if (mask & SAMPLE_EXCEPTION_CNT) { str = out->except_cnt.hits ? "xdp_exception total" : "xdp_exception"; print_err(out->except_cnt.hits, " %-20s " FMT_COLUMNl "\n", str, HITS(out->except_cnt.hits)); stats_get_exception_cnt(r, p, nr_cpus, NULL); } if (mask & SAMPLE_DEVMAP_XMIT_CNT) { str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ? "devmap_xmit total" : "devmap_xmit"; print_err(out->xmit_cnt.err || out->xmit_cnt.drop, " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl __COLUMN(".2f") "\n", str, XMIT(out->xmit_cnt.pps), DROP(out->xmit_cnt.drop), (uint64_t)out->xmit_cnt.err, "drv_err/s", out->xmit_cnt.bavg, "bulk-avg"); stats_get_devmap_xmit(r, p, nr_cpus, NULL); } if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL); if (sample_log_level & LL_DEFAULT || ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { sample_err_exp = false; printf("\n"); } fflush(stdout); fflush(stderr); // Flushing both outputs to "bypass" buffering } static int get_num_rxqs(const char *ifname) { struct ethtool_channels ch = { .cmd = ETHTOOL_GCHANNELS, }; struct ifreq ifr = { .ifr_data = (void *)&ch, }; int fd, ret; if (!ifname || strlen(ifname) > sizeof(ifr.ifr_name) - 1) return 0; strcpy(ifr.ifr_name, ifname); fd = socket(AF_UNIX, SOCK_DGRAM, 0); if (fd < 0) { ret = -errno; pr_warn("Couldn't open socket socket: %s\n", strerror(-ret)); return ret; } ret = ioctl(fd, SIOCETHTOOL, &ifr); if (ret < 0) { ret = -errno; pr_debug("Error in ethtool ioctl: %s\n", strerror(-ret)); goto out; } ret = ch.rx_count + ch.combined_count; 
pr_debug("Got %d queues for ifname %s\n", ret, ifname); out: close(fd); return ret; } int sample_setup_maps(struct bpf_map **maps, const char *ifname) { sample_n_cpus = libbpf_num_possible_cpus(); for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { sample_map[i] = maps[i]; int n_cpus; switch (i) { case MAP_RX: case MAP_CPUMAP_KTHREAD: case MAP_DEVMAP_XMIT: sample_map_count[i] = sample_n_cpus; break; case MAP_RXQ: sample_n_rxqs = get_num_rxqs(ifname); sample_map_count[i] = sample_n_rxqs > 0 ? sample_n_rxqs : 1; break; case MAP_REDIRECT_ERR: sample_map_count[i] = XDP_REDIRECT_ERR_MAX * sample_n_cpus; break; case MAP_EXCEPTION: sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus; break; case MAP_CPUMAP_ENQUEUE: if (__builtin_mul_overflow(sample_n_cpus, sample_n_cpus, &n_cpus)) return -EOVERFLOW; sample_map_count[i] = n_cpus; break; default: return -EINVAL; } if (bpf_map__set_max_entries(sample_map[i], sample_map_count[i]) < 0) return -errno; } sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI]; return 0; } static int sample_setup_maps_mappings(void) { for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { size_t size = sample_map_count[i] * sizeof(struct datarec); sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, bpf_map__fd(sample_map[i]), 0); if (sample_mmap[i] == MAP_FAILED) return -errno; } return 0; } int __sample_init(int mask, int ifindex_from, int ifindex_to) { sigset_t st; if (mask & SAMPLE_RXQ_STATS && sample_n_rxqs <= 0) { pr_warn("Couldn't retrieve the number of RXQs, so can't enable RXQ stats\n"); return -EINVAL; } sigemptyset(&st); sigaddset(&st, SIGQUIT); sigaddset(&st, SIGINT); sigaddset(&st, SIGTERM); if (sigprocmask(SIG_BLOCK, &st, NULL) < 0) return -errno; sample_sig_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK); if (sample_sig_fd < 0) return -errno; sample_mask = mask; ifindex[0] = ifindex_from; ifindex[1] = ifindex_to; return sample_setup_maps_mappings(); } static void sample_summary_print(void) { __u64 start = 
sample_start_time; __u64 now = gettime(); double dur_s = ((double)now - start) / NANOSEC_PER_SEC; print_always(" Duration : %.1fs\n", dur_s); if (sample_out.totals.rx) { double pkts = sample_out.totals.rx; print_always(" Packets received : %'-10" PRIu64 "\n", (uint64_t)sample_out.totals.rx); print_always(" Average packets/s : %'-10.0f\n", round(pkts / dur_s)); } if (sample_out.totals.redir) { double pkts = sample_out.totals.redir; print_always(" Packets redirected : %'-10" PRIu64 "\n", sample_out.totals.redir); print_always(" Average redir/s : %'-10.0f\n", round(pkts / dur_s)); } if (sample_out.totals.drop) print_always(" Rx dropped : %'-10" PRIu64 "\n", sample_out.totals.drop); if (sample_out.totals.drop_xmit) print_always(" Tx dropped : %'-10" PRIu64 "\n", sample_out.totals.drop_xmit); if (sample_out.totals.err) print_always(" Errors recorded : %'-10" PRIu64 "\n", sample_out.totals.err); if (sample_out.totals.xmit) { double pkts = sample_out.totals.xmit; print_always(" Packets transmitted : %'-10" PRIu64 "\n", sample_out.totals.xmit); print_always(" Average transmit/s : %'-10.0f\n", round(pkts / dur_s)); } } void sample_teardown(void) { size_t size; for (int i = 0; i < NUM_MAP; i++) { size = sample_map_count[i] * sizeof(**sample_mmap); munmap(sample_mmap[i], size); } sample_summary_print(); close(sample_sig_fd); } static int sample_stats_collect(struct stats_record *rec) { int i; if (sample_mask & SAMPLE_RX_CNT) map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt); if (sample_mask & SAMPLE_RXQ_STATS) map_collect_rxqs(sample_mmap[MAP_RXQ], &rec->rxq_cnt); if (sample_mask & SAMPLE_REDIRECT_CNT) map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]); if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) { for (i = 1; i < XDP_REDIRECT_ERR_MAX; i++) map_collect_percpu(&sample_mmap[MAP_REDIRECT_ERR][i * sample_n_cpus], &rec->redir_err[i]); } if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) for (i = 0; i < sample_n_cpus; i++) 
map_collect_percpu(&sample_mmap[MAP_CPUMAP_ENQUEUE][i * sample_n_cpus], &rec->enq[i]); if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD], &rec->kthread); if (sample_mask & SAMPLE_EXCEPTION_CNT) for (i = 0; i < XDP_ACTION_MAX; i++) map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], &rec->exception[i]); if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit); if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) { if (map_collect_percpu_devmap(bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]), rec) < 0) return -EINVAL; } return 0; } static void sample_summary_update(struct sample_output *out) { sample_out.totals.rx += out->totals.rx; sample_out.totals.redir += out->totals.redir; sample_out.totals.drop += out->totals.drop; sample_out.totals.drop_xmit += out->totals.drop_xmit; sample_out.totals.err += out->totals.err; sample_out.totals.xmit += out->totals.xmit; } static void sample_stats_print(int mask, struct stats_record *cur, struct stats_record *prev, char *prog_name) { struct sample_output out = {}; if (mask & SAMPLE_RX_CNT) stats_get_rx_cnt(cur, prev, 0, &out); if (mask & SAMPLE_REDIRECT_CNT) stats_get_redirect_cnt(cur, prev, 0, &out); if (mask & SAMPLE_REDIRECT_ERR_CNT) stats_get_redirect_err_cnt(cur, prev, 0, &out); if (mask & SAMPLE_EXCEPTION_CNT) stats_get_exception_cnt(cur, prev, 0, &out); if (mask & SAMPLE_DEVMAP_XMIT_CNT) stats_get_devmap_xmit(cur, prev, 0, &out); else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) stats_get_devmap_xmit_multi(cur, prev, 0, &out); sample_summary_update(&out); stats_print(prog_name, mask, cur, prev, &out); } void sample_switch_mode(void) { sample_log_level ^= LL_DEBUG - 1; } static int sample_signal_cb(void) { struct signalfd_siginfo si; int r; r = read(sample_sig_fd, &si, sizeof(si)); if (r < 0) return -errno; switch (si.ssi_signo) { case SIGQUIT: sample_switch_mode(); printf("\n"); break; default: printf("\n"); return 1; } 
return 0; } /* Pointer swap trick */ static void swap(struct stats_record **a, struct stats_record **b) { struct stats_record *tmp; tmp = *a; *a = *b; *b = tmp; } static int print_stats(struct stats_record **rec, struct stats_record **prev) { char line[64] = "Summary"; int ret; swap(prev, rec); ret = sample_stats_collect(*rec); if (ret < 0) return ret; if (ifindex[0] && !(sample_mask & SAMPLE_SKIP_HEADING)) { char fi[IFNAMSIZ]; char to[IFNAMSIZ]; const char *f, *t; f = t = NULL; if (if_indextoname(ifindex[0], fi)) f = fi; if (if_indextoname(ifindex[1], to)) t = to; snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?"); } sample_stats_print(sample_mask, *rec, *prev, line); return 0; } static int sample_timer_cb(int timerfd, struct stats_record **rec, struct stats_record **prev) { int ret; __u64 t; ret = read(timerfd, &t, sizeof(t)); if (ret < 0) return -errno; return print_stats(rec, prev); } bool sample_immediate_exit(void) { const char *envval; envval = secure_getenv("XDP_SAMPLE_IMMEDIATE_EXIT"); if (envval && envval[0] == '1' && envval[1] == '\0') { pr_debug("XDP_SAMPLE_IMMEDIATE_EXIT envvar set, exiting immediately after setup\n"); return true; } return false; } int sample_run(unsigned int interval, void (*post_cb)(void *), void *ctx) { bool imm_exit = sample_immediate_exit(); struct timespec ts = { interval, 0 }; struct itimerspec its = { ts, ts }; struct stats_record *rec, *prev; struct pollfd pfd[2] = {}; int timerfd, ret; if (!interval) { pr_warn("Incorrect interval 0\n"); return -EINVAL; } sample_interval = interval; /* Pretty print numbers */ setlocale(LC_NUMERIC, "en_US.UTF-8"); timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); if (timerfd < 0) return -errno; timerfd_settime(timerfd, 0, &its, NULL); pfd[0].fd = sample_sig_fd; pfd[0].events = POLLIN; pfd[1].fd = timerfd; pfd[1].events = POLLIN; ret = -ENOMEM; rec = alloc_stats_record(); if (!rec) goto end; prev = alloc_stats_record(); if (!prev) goto end_rec; sample_start_time = 
gettime(); ret = sample_stats_collect(rec); if (ret < 0) goto end_rec_prev; if (imm_exit) goto end_rec_prev; for (;;) { ret = poll(pfd, 2, -1); if (ret < 0) { if (errno == EINTR) continue; else break; } if (pfd[0].revents & POLLIN) { ret = sample_signal_cb(); if (ret) print_stats(&rec, &prev); } else if (pfd[1].revents & POLLIN) ret = sample_timer_cb(timerfd, &rec, &prev); if (ret) break; if (post_cb) post_cb(ctx); } end_rec_prev: free_stats_record(prev); end_rec: free_stats_record(rec); end: close(timerfd); return ret; } const char *get_driver_name(int ifindex) { struct ethtool_drvinfo drv = {}; char ifname[IF_NAMESIZE]; static char drvname[32]; struct ifreq ifr = {}; int fd, r = 0; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return "[error]"; if (!if_indextoname(ifindex, ifname)) goto end; drv.cmd = ETHTOOL_GDRVINFO; safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); ifr.ifr_data = (void *)&drv; r = ioctl(fd, SIOCETHTOOL, &ifr); if (r) goto end; safe_strncpy(drvname, drv.driver, sizeof(drvname)); close(fd); return drvname; end: r = errno; close(fd); return r == EOPNOTSUPP ? 
"loopback" : "[error]"; } int get_mac_addr(int ifindex, void *mac_addr) { char ifname[IF_NAMESIZE]; struct ifreq ifr = {}; int fd, r; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return -errno; if (!if_indextoname(ifindex, ifname)) { r = -errno; goto end; } safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); r = ioctl(fd, SIOCGIFHWADDR, &ifr); if (r) { r = -errno; goto end; } memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); end: close(fd); return r; } xdp-tools-1.6.1/lib/util/xdp_sample.h000066400000000000000000000116321514310632100174770ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only #ifndef XDP_SAMPLE_USER_H #define XDP_SAMPLE_USER_H #include #include #include #include "compat.h" enum stats_mask { _SAMPLE_REDIRECT_MAP = 1U << 0, SAMPLE_RX_CNT = 1U << 1, SAMPLE_REDIRECT_ERR_CNT = 1U << 2, SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, SAMPLE_EXCEPTION_CNT = 1U << 5, SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, SAMPLE_REDIRECT_CNT = 1U << 7, SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8, SAMPLE_SKIP_HEADING = 1U << 9, SAMPLE_RXQ_STATS = 1U << 10, SAMPLE_DROP_OK = 1U << 11, }; enum sample_compat { SAMPLE_COMPAT_CPUMAP_KTHREAD, __SAMPLE_COMPAT_MAX }; #define SAMPLE_COMPAT_MAX __SAMPLE_COMPAT_MAX /* Exit return codes */ #define EXIT_OK 0 #define EXIT_FAIL 1 #define EXIT_FAIL_OPTION 2 #define EXIT_FAIL_XDP 3 #define EXIT_FAIL_BPF 4 #define EXIT_FAIL_MEM 5 int sample_setup_maps(struct bpf_map **maps, const char *ifname); int __sample_init(int mask, int ifindex_from, int ifindex_to); void sample_teardown(void); int sample_run(unsigned int interval, void (*post_cb)(void *), void *ctx); bool sample_is_compat(enum sample_compat compat_value); bool sample_probe_cpumap_compat(void); bool sample_probe_xdp_load_bytes(void); bool sample_immediate_exit(void); void sample_check_cpumap_compat(struct bpf_program *prog, struct bpf_program *prog_compat); void sample_switch_mode(void); const char *get_driver_name(int ifindex); int 
get_mac_addr(int ifindex, void *mac_addr); #pragma GCC diagnostic push #if !defined(__clang__) && (__GNUC__ > 7) #pragma GCC diagnostic ignored "-Wstringop-truncation" #endif __attribute__((unused)) static inline char *safe_strncpy(char *dst, const char *src, size_t size) { if (!size) return dst; strncpy(dst, src, size - 1); dst[size - 1] = '\0'; return dst; } #pragma GCC diagnostic pop #define __attach_tp(name) \ ({ \ if (bpf_program__type(skel->progs.name) != BPF_PROG_TYPE_TRACING)\ return -EINVAL; \ skel->links.name = bpf_program__attach(skel->progs.name); \ if (!skel->links.name) \ return -errno; \ }) #define __attach_tp_compat(name, name_compat, _compat) \ ({ \ if (sample_is_compat(SAMPLE_COMPAT_ ## _compat)) \ __attach_tp(name); \ else \ __attach_tp(name_compat); \ }) #define sample_init_pre_load(skel, ifname) \ ({ \ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); \ sample_check_cpumap_compat(skel->progs.tp_xdp_cpumap_kthread, \ skel->progs.tp_xdp_cpumap_compat); \ sample_setup_maps((struct bpf_map *[]){ \ skel->maps.rx_cnt, skel->maps.rxq_cnt, \ skel->maps.redir_err_cnt, \ skel->maps.cpumap_enqueue_cnt, \ skel->maps.cpumap_kthread_cnt, \ skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt, \ skel->maps.devmap_xmit_cnt_multi}, ifname); \ }) #define DEFINE_SAMPLE_INIT(name) \ static int sample_init(struct name *skel, int sample_mask, \ int ifindex_from, int ifindex_to) \ { \ int ret; \ ret = __sample_init(sample_mask, ifindex_from, \ ifindex_to); \ if (ret < 0) \ return ret; \ if (sample_mask & SAMPLE_REDIRECT_CNT) \ __attach_tp(tp_xdp_redirect); \ if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) \ __attach_tp(tp_xdp_redirect_err); \ if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) \ __attach_tp(tp_xdp_cpumap_enqueue); \ if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) \ __attach_tp_compat(tp_xdp_cpumap_kthread, \ tp_xdp_cpumap_compat, \ CPUMAP_KTHREAD); \ if (sample_mask & SAMPLE_EXCEPTION_CNT) \ __attach_tp(tp_xdp_exception); \ if (sample_mask & 
SAMPLE_DEVMAP_XMIT_CNT) \ __attach_tp(tp_xdp_devmap_xmit); \ if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) \ __attach_tp(tp_xdp_devmap_xmit_multi); \ return 0; \ } #endif xdp-tools-1.6.1/lib/util/xdpsock.bpf.c000066400000000000000000000011551514310632100175560ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 #include #include /* This XDP program is only needed for multi-buffer and XDP_SHARED_UMEM modes. * If you do not use these modes, libxdp can supply an XDP program for you. */ #define MAX_SOCKS 4 struct { __uint(type, BPF_MAP_TYPE_XSKMAP); __uint(max_entries, MAX_SOCKS); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); } xsks_map SEC(".maps"); int num_socks = 0; static unsigned int rr; SEC("xdp") int xdp_sock_prog(struct xdp_md *ctx) { rr = (rr + 1) & (num_socks - 1); return bpf_redirect_map(&xsks_map, rr, XDP_DROP); } xdp-tools-1.6.1/lib/util/xdpsock.c000066400000000000000000001416741514310632100170230ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 - 2022 Intel Corporation. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xdpsock.h" #include "xdp_sample.h" #include "logging.h" #include "util.h" #include "xdpsock.skel.h" #ifndef SOL_XDP #define SOL_XDP 283 #endif #ifndef AF_XDP #define AF_XDP 44 #endif #ifndef PF_XDP #define PF_XDP AF_XDP #endif #ifndef SO_PREFER_BUSY_POLL #define SO_PREFER_BUSY_POLL 69 #endif #ifndef SO_BUSY_POLL_BUDGET #define SO_BUSY_POLL_BUDGET 70 #endif #define NUM_FRAMES (4 * 1024UL) #define IS_EOP_DESC(options) (!((options) & XDP_PKT_CONTD)) #define DEBUG_HEXDUMP 0 #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_VID__DEFAULT 1 #define VLAN_PRI__DEFAULT 0 #define NSEC_PER_SEC 1000000000UL #define NSEC_PER_USEC 1000 #define SCHED_PRI__DEFAULT 0 #define STRERR_BUFSIZE 1024 #define POLL_TIMEOUT 1000 struct vlan_ethhdr { unsigned char h_dest[6]; unsigned char h_source[6]; __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #define PKTGEN_MAGIC 0xbe9be955 struct pktgen_hdr { __be32 pgh_magic; __be32 seq_num; __be32 tv_sec; __be32 tv_usec; }; struct xsk_ring_stats { unsigned long rx_frags; unsigned long rx_npkts; unsigned long tx_frags; unsigned long tx_npkts; unsigned long rx_dropped_npkts; unsigned long rx_invalid_npkts; unsigned long tx_invalid_npkts; unsigned long rx_full_npkts; unsigned long rx_fill_empty_npkts; unsigned long tx_empty_npkts; unsigned long prev_rx_frags; unsigned long prev_rx_npkts; unsigned long prev_tx_frags; unsigned long prev_tx_npkts; unsigned long prev_rx_dropped_npkts; unsigned long prev_rx_invalid_npkts; unsigned long prev_tx_invalid_npkts; unsigned long 
prev_rx_full_npkts; unsigned long prev_rx_fill_empty_npkts; unsigned long prev_tx_empty_npkts; }; struct xsk_driver_stats { unsigned long intrs; unsigned long prev_intrs; }; struct xsk_app_stats { unsigned long rx_empty_polls; unsigned long fill_fail_polls; unsigned long copy_tx_sendtos; unsigned long tx_wakeup_sendtos; unsigned long opt_polls; unsigned long prev_rx_empty_polls; unsigned long prev_fill_fail_polls; unsigned long prev_copy_tx_sendtos; unsigned long prev_tx_wakeup_sendtos; unsigned long prev_opt_polls; }; struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; struct xsk_umem *umem; void *buffer; }; struct xsk_socket_info { struct xsk_ring_cons rx; struct xsk_ring_prod tx; struct xsk_umem_info *umem; struct xsk_socket *xsk; struct xsk_ring_stats ring_stats; struct xsk_app_stats app_stats; struct xsk_driver_stats drv_stats; __u32 outstanding_tx; bool copy_mode; }; static unsigned long get_nsecs(clockid_t clock) { struct timespec ts; int res; res = clock_gettime(clock, &ts); if (res < 0) { pr_warn("Error with gettimeofday! (%i)\n", res); return UINT64_MAX; } return ts.tv_sec * 1000000000UL + ts.tv_nsec; } static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) { struct xdp_statistics stats; socklen_t optlen; int err; optlen = sizeof(stats); err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); if (err) return err; if (optlen == sizeof(struct xdp_statistics)) { xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; return 0; } return -EINVAL; } static void dump_app_stats(const struct xsk_ctx *ctx, long dt, unsigned int i) { #define PPS(_now, _prev) ((_now - _prev) * 1000000000. 
/ dt) char *fmt = " %-18s %'14.0f calls/s\n"; double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, tx_wakeup_sendtos_ps, opt_polls_ps; rx_empty_polls_ps = PPS(ctx->xsks[i]->app_stats.rx_empty_polls, ctx->xsks[i]->app_stats.prev_rx_empty_polls); fill_fail_polls_ps = PPS(ctx->xsks[i]->app_stats.fill_fail_polls, ctx->xsks[i]->app_stats.prev_fill_fail_polls); copy_tx_sendtos_ps = PPS(ctx->xsks[i]->app_stats.copy_tx_sendtos, ctx->xsks[i]->app_stats.prev_copy_tx_sendtos); tx_wakeup_sendtos_ps = PPS(ctx->xsks[i]->app_stats.tx_wakeup_sendtos, ctx->xsks[i]->app_stats.prev_tx_wakeup_sendtos); opt_polls_ps = PPS(ctx->xsks[i]->app_stats.opt_polls, ctx->xsks[i]->app_stats.prev_opt_polls); printf(fmt, "RX empty polls", rx_empty_polls_ps); printf(fmt, "Till fail polls", fill_fail_polls_ps); printf(fmt, "Copy tx sendtos", copy_tx_sendtos_ps); printf(fmt, "TX wakeup sendtos", tx_wakeup_sendtos_ps); printf(fmt, "Opt polls", opt_polls_ps); ctx->xsks[i]->app_stats.prev_rx_empty_polls = ctx->xsks[i]->app_stats.rx_empty_polls; ctx->xsks[i]->app_stats.prev_fill_fail_polls = ctx->xsks[i]->app_stats.fill_fail_polls; ctx->xsks[i]->app_stats.prev_copy_tx_sendtos = ctx->xsks[i]->app_stats.copy_tx_sendtos; ctx->xsks[i]->app_stats.prev_tx_wakeup_sendtos = ctx->xsks[i]->app_stats.tx_wakeup_sendtos; ctx->xsks[i]->app_stats.prev_opt_polls = ctx->xsks[i]->app_stats.opt_polls; #undef PPS } static bool get_interrupt_number(struct xsk_ctx *ctx, const char *irq_string) { FILE *f_int_proc; char line[4096]; bool found = false; f_int_proc = fopen("/proc/interrupts", "r"); if (f_int_proc == NULL) { printf("Failed to open /proc/interrupts.\n"); return found; } while (!feof(f_int_proc) && !found) { /* Make sure to read a full line at a time */ if (fgets(line, sizeof(line), f_int_proc) == NULL || line[strlen(line) - 1] != '\n') { printf("Error reading from interrupts file\n"); break; } /* Extract interrupt number from line */ if (strstr(line, irq_string) != NULL) { ctx->irq_no = atoi(line); found 
= true; break; } } fclose(f_int_proc); return found; } static int get_irqs(const struct xsk_ctx *ctx) { char count_path[PATH_MAX]; FILE *f_count_proc; char line[4096]; char *p = NULL; int ret; snprintf(count_path, sizeof(count_path), "/sys/kernel/irq/%i/per_cpu_count", ctx->irq_no); f_count_proc = fopen(count_path, "r"); if (f_count_proc == NULL) { ret = -errno; pr_warn("Failed to open %s: %s\n", count_path, strerror(-ret)); return ret; } if (fgets(line, sizeof(line), f_count_proc) == NULL || line[strlen(line) - 1] != '\n') { pr_warn("Error reading from %s\n", count_path); ret = -ENOENT; } else { static const char com[2] = ","; char *token; ret = 0; token = strtok_r(line, com, &p); while (token != NULL) { /* sum up interrupts across all cores */ ret += atoi(token); token = strtok_r(NULL, com, &p); } } fclose(f_count_proc); return ret; } static void dump_driver_stats(struct xsk_ctx *ctx, long dt) { #define PPS(_now, _prev) ((_now - _prev) * 1000000000. / dt) char *fmt = " %-18s %'14.0f intrs/s\n"; unsigned int i = 0; double intrs_ps; int n_ints; if (i >= ctx->num_socks || !ctx->xsks[i]) return; n_ints = get_irqs(ctx); if (n_ints < 0) { printf("error getting intr info for intr %i\n", ctx->irq_no); return; } ctx->xsks[i]->drv_stats.intrs = n_ints - ctx->irqs_at_init; intrs_ps = PPS(ctx->xsks[i]->drv_stats.intrs, ctx->xsks[i]->drv_stats.prev_intrs); printf(fmt, "IRQs", intrs_ps); ctx->xsks[i]->drv_stats.prev_intrs = ctx->xsks[i]->drv_stats.intrs; #undef PPS } static void dump_end_stats(struct xsk_ctx *ctx) { __u64 total_rx_f = 0, total_tx_f = 0, total_rx = 0, total_tx = 0; unsigned int i; for (i = 0; i < ctx->num_socks && ctx->xsks[i]; i++) { total_rx += ctx->xsks[i]->ring_stats.rx_npkts; total_tx += ctx->xsks[i]->ring_stats.tx_npkts; total_rx_f += ctx->xsks[i]->ring_stats.rx_frags; total_tx_f += ctx->xsks[i]->ring_stats.tx_frags; } printf("\nTotals:\n"); if (ctx->rx) { printf(" %-18s %'14" PRIu64 " pkts", "RX", (uint64_t)total_rx); if (ctx->opt.frags) printf(" %'14" 
PRIu64 " frags", (uint64_t)total_rx_f); printf("\n"); } if (ctx->tx) { printf(" %-18s %'14" PRIu64 " pkts", "TX", (uint64_t)total_tx); if (ctx->opt.frags) printf(" %'14" PRIu64 " frags", (uint64_t)total_tx_f); printf("\n"); } if (ctx->irq_no) printf(" %-18s %'14lu intrs", "IRQs", ctx->xsks[0]->drv_stats.intrs); for (i = 0; i < ctx->num_socks && ctx->xsks[i]; i++) { char *fmt = " %-18s %'14lu pkts\n"; printf("\n sock%d:\n", i); if (ctx->rx) { printf(" %-18s %'14lu pkts", "RX", ctx->xsks[i]->ring_stats.rx_npkts); if (ctx->opt.frags) printf(" %'14lu frags", ctx->xsks[i]->ring_stats.rx_frags); printf("\n"); } if (ctx->tx) { printf(" %-18s %'14lu pkts", "TX", ctx->xsks[i]->ring_stats.tx_npkts); if (ctx->opt.frags) printf(" %'14lu frags", ctx->xsks[i]->ring_stats.tx_frags); printf("\n"); } if (ctx->extra_stats) { printf("\n"); printf(fmt, "RX dropped", ctx->xsks[i]->ring_stats.rx_dropped_npkts); printf(fmt, "RX invalid", ctx->xsks[i]->ring_stats.rx_invalid_npkts); printf(fmt, "TX invalid", ctx->xsks[i]->ring_stats.tx_invalid_npkts); printf(fmt, "RX queue full", ctx->xsks[i]->ring_stats.rx_full_npkts); printf(fmt, "Fill ring empty", ctx->xsks[i]->ring_stats.rx_fill_empty_npkts); printf(fmt, "TX ring empty", ctx->xsks[i]->ring_stats.tx_empty_npkts); } if (ctx->opt.app_stats) { printf("\n"); char *fmt = " %-18s %'14lu calls\n"; printf(fmt, "RX empty polls", ctx->xsks[i]->app_stats.rx_empty_polls); printf(fmt, "Till fail polls", ctx->xsks[i]->app_stats.fill_fail_polls); printf(fmt, "Copy tx sendtos", ctx->xsks[i]->app_stats.copy_tx_sendtos); printf(fmt, "TX wakeup sendtos", ctx->xsks[i]->app_stats.tx_wakeup_sendtos); printf(fmt, "Opt polls", ctx->xsks[i]->app_stats.opt_polls); } } } static void dump_stats(struct xsk_ctx *ctx) { __u64 total_rx_f = 0, prev_total_rx_f = 0, total_tx_f = 0, prev_total_tx_f = 0; __u64 total_rx = 0, prev_total_rx = 0, total_tx = 0, prev_total_tx = 0; unsigned long now = get_nsecs(ctx->opt.clock); long dt = now - ctx->prev_time; unsigned int i; 
#define PPS(_now, _prev) ((_now - _prev) * 1000000000. / dt) ctx->prev_time = now; for (i = 0; i < ctx->num_socks && ctx->xsks[i]; i++) { total_rx += ctx->xsks[i]->ring_stats.rx_npkts; total_tx += ctx->xsks[i]->ring_stats.tx_npkts; total_rx_f += ctx->xsks[i]->ring_stats.rx_frags; total_tx_f += ctx->xsks[i]->ring_stats.tx_frags; prev_total_rx += ctx->xsks[i]->ring_stats.prev_rx_npkts; prev_total_tx += ctx->xsks[i]->ring_stats.prev_tx_npkts; prev_total_rx_f += ctx->xsks[i]->ring_stats.prev_rx_frags; prev_total_tx_f += ctx->xsks[i]->ring_stats.prev_tx_frags; } printf("%s:%d", ctx->opt.iface.ifname, ctx->opt.queue_idx); if (ctx->rx) { printf(" %'14.0f rx/s", PPS(total_rx, prev_total_rx)); if (ctx->opt.frags) printf(" %'14.0f rx frag/s", PPS(total_rx_f, prev_total_rx_f)); } if (ctx->tx) { printf(" %'14.0f xmit/s", PPS(total_tx, prev_total_tx)); if (ctx->opt.frags) printf(" %'14.0f xmit frag/s", PPS(total_tx_f, prev_total_tx_f)); } printf("\n"); if (ctx->irq_no) dump_driver_stats(ctx, dt); for (i = 0; i < ctx->num_socks && ctx->xsks[i]; i++) { char *fmt = " %-18s %'14.0f %-6s\n"; char *fmt_2 = " %-18s %'14.0f %-6s %'14.0f %-6s\n"; double rx_pps, rx_fps, tx_pps, tx_fps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; __u64 rx_npkts = ctx->xsks[i]->ring_stats.rx_npkts; __u64 rx_frags = ctx->xsks[i]->ring_stats.rx_frags; __u64 tx_npkts = ctx->xsks[i]->ring_stats.tx_npkts; __u64 tx_frags = ctx->xsks[i]->ring_stats.tx_frags; rx_fps = PPS(rx_frags, ctx->xsks[i]->ring_stats.prev_rx_frags); tx_fps = PPS(tx_frags, ctx->xsks[i]->ring_stats.prev_tx_frags); rx_pps = PPS(rx_npkts, ctx->xsks[i]->ring_stats.prev_rx_npkts); tx_pps = PPS(tx_npkts, ctx->xsks[i]->ring_stats.prev_tx_npkts); ctx->xsks[i]->ring_stats.prev_rx_frags = rx_frags; ctx->xsks[i]->ring_stats.prev_tx_frags = tx_frags; ctx->xsks[i]->ring_stats.prev_rx_npkts = rx_npkts; ctx->xsks[i]->ring_stats.prev_tx_npkts = tx_npkts; if (ctx->num_socks > 1 || ctx->extra_stats || 
ctx->opt.app_stats) { printf(" sock%-14d", i); if (ctx->rx) { printf(" %'14.0f rx/s ", rx_pps); if (ctx->opt.frags) printf(" %'14.0f rx frag/s", rx_fps); } if (ctx->tx) { printf(" %'14.0f xmit/s", tx_pps); if (ctx->opt.frags) printf(" %'14.0f xmit frag/s", tx_fps); } printf("\n"); } if (ctx->extra_stats) { if (!xsk_get_xdp_stats(xsk_socket__fd(ctx->xsks[i]->xsk), ctx->xsks[i])) { dropped_pps = PPS(ctx->xsks[i]->ring_stats.rx_dropped_npkts, ctx->xsks[i]->ring_stats.prev_rx_dropped_npkts); rx_invalid_pps = PPS(ctx->xsks[i]->ring_stats.rx_invalid_npkts, ctx->xsks[i]->ring_stats.prev_rx_invalid_npkts); tx_invalid_pps = PPS(ctx->xsks[i]->ring_stats.tx_invalid_npkts, ctx->xsks[i]->ring_stats.prev_tx_invalid_npkts); full_pps = PPS(ctx->xsks[i]->ring_stats.rx_full_npkts, ctx->xsks[i]->ring_stats.prev_rx_full_npkts); fill_empty_pps = PPS(ctx->xsks[i]->ring_stats.rx_fill_empty_npkts, ctx->xsks[i]->ring_stats.prev_rx_fill_empty_npkts); tx_empty_pps = PPS(ctx->xsks[i]->ring_stats.tx_empty_npkts, ctx->xsks[i]->ring_stats.prev_tx_empty_npkts); printf(fmt, "Dropped", dropped_pps, "pkt/s"); printf(fmt_2, "Invalid", rx_invalid_pps, "rx/s", tx_invalid_pps, "tx/s"); printf(fmt, "Queue full", full_pps, "rx/s"); printf(fmt, "Fill ring empty", fill_empty_pps, "pkt/s"); printf(fmt, "TX ring empty", tx_empty_pps, "pkt/s"); ctx->xsks[i]->ring_stats.prev_rx_dropped_npkts = ctx->xsks[i]->ring_stats.rx_dropped_npkts; ctx->xsks[i]->ring_stats.prev_rx_invalid_npkts = ctx->xsks[i]->ring_stats.rx_invalid_npkts; ctx->xsks[i]->ring_stats.prev_tx_invalid_npkts = ctx->xsks[i]->ring_stats.tx_invalid_npkts; ctx->xsks[i]->ring_stats.prev_rx_full_npkts = ctx->xsks[i]->ring_stats.rx_full_npkts; ctx->xsks[i]->ring_stats.prev_rx_fill_empty_npkts = ctx->xsks[i]->ring_stats.rx_fill_empty_npkts; ctx->xsks[i]->ring_stats.prev_tx_empty_npkts = ctx->xsks[i]->ring_stats.tx_empty_npkts; } else { printf("%-18s\n", "Error retrieving extra stats"); } } if (ctx->opt.app_stats) { printf("\n"); dump_app_stats(ctx, dt, 
i); } } #undef PPS if (ctx->opt.app_stats &&ctx->tx_cycle_ns) { printf(" %-18s period:%-10lu min:%-10lu ave:%-10lu max:%-10lu cycle:%-10lu\n", "Cyclic TX", ctx->tx_cycle_ns, ctx->tx_cycle_diff_min, (long)(ctx->tx_cycle_diff_ave / ctx->tx_cycle_cnt), ctx->tx_cycle_diff_max, ctx->tx_cycle_cnt); } } static bool is_benchmark_done(struct xsk_ctx *ctx) { if (ctx->duration > 0) { unsigned long dt = (get_nsecs(ctx->opt.clock) - ctx->start_time); if (dt >= ctx->duration) ctx->benchmark_done = true; } if (sample_immediate_exit()) ctx->benchmark_done = true; return ctx->benchmark_done; } static int signal_cb(struct xsk_ctx *ctx) { struct signalfd_siginfo si; int r; r = read(ctx->signal_fd, &si, sizeof(si)); if (r < 0) return -errno; switch (si.ssi_signo) { case SIGQUIT: ctx->extra_stats = !ctx->extra_stats; printf("\n"); break; default: printf("\n"); return 1; } return 0; } int xsk_stats_poller(struct xsk_ctx *ctx) { struct timespec ts = { ctx->opt.interval, 0 }; struct itimerspec its = { ts, ts }; struct pollfd pfd[2] = {}; int timerfd, ret = 0; __u64 t; setlocale(LC_NUMERIC, "en_US.UTF-8"); timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); if (timerfd < 0) return -errno; if (timerfd_settime(timerfd, 0, &its, NULL)) { ret = -errno; goto out; } pfd[0].fd = ctx->signal_fd; pfd[0].events = POLLIN; pfd[1].fd = timerfd; pfd[1].events = POLLIN; while (!is_benchmark_done(ctx)) { ret = poll(pfd, 2, -1); if (ret < 0) { if (errno == EINTR) continue; else goto out; } if (pfd[0].revents & POLLIN) { ret = signal_cb(ctx); if (ret) { dump_stats(ctx); break; } } if (pfd[1].revents & POLLIN) { ret = read(timerfd, &t, sizeof(t)); if (ret < 0) { ret = -errno; goto out; } if (!ctx->opt.quiet) dump_stats(ctx); } } ret = 0; out: ctx->benchmark_done = true; close(timerfd); return ret; } void xsk_ctx__destroy(struct xsk_ctx *ctx) { struct xsk_umem *umem = ctx->xsks[0]->umem->umem; unsigned int i; dump_end_stats(ctx); for (i = 0; i < ctx->num_socks; i++) 
xsk_socket__delete(ctx->xsks[i]->xsk); (void)xsk_umem__delete(umem); if (ctx->xdp_prog) { xdp_program__detach(ctx->xdp_prog, ctx->opt.iface.ifindex, ctx->opt.attach_mode, 0); xdp_program__close(ctx->xdp_prog); } close(ctx->signal_fd); munmap(ctx->bufs, NUM_FRAMES * ctx->opt.frame_size); free(ctx); } static void swap_mac_addresses(void *data) { struct ether_header *eth = (struct ether_header *)data; struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; struct ether_addr tmp; tmp = *src_addr; *src_addr = *dst_addr; *dst_addr = tmp; } static void hex_dump(void *pkt, size_t length, __u64 addr) { const unsigned char *address = (unsigned char *)pkt; const unsigned char *line = address; size_t line_size = 32; unsigned char c; char buf[32]; int i = 0; if (!DEBUG_HEXDUMP) return; sprintf(buf, "addr=%" PRIu64, (uint64_t)addr); printf("length = %zu\n", length); printf("%s | ", buf); while (length-- > 0) { printf("%02X ", *address++); if (!(++i % line_size) || (length == 0 && i % line_size)) { if (length == 0) { while (i++ % line_size) printf("__ "); } printf(" | "); /* right close */ while (line < address) { c = *line++; printf("%c", (c < 33 || c == 255) ? 0x2E : c); } printf("\n"); if (length > 0) printf("%s | ", buf); } } printf("\n"); } static void *memset32_htonl(void *dest, __u32 val, __u32 size) { __u32 *ptr = (__u32 *)dest; __u32 i; val = htonl(val); for (i = 0; i < (size & (~0x3)); i += 4) ptr[i >> 2] = val; for (; i < size; i++) ((char *)dest)[i] = ((char *)&val)[i & 3]; return dest; } /* * This function code has been taken from * Linux kernel lib/checksum.c */ static inline unsigned short from32to16(unsigned int x) { /* add up 16-bit and 16-bit for 16+c bit */ x = (x & 0xffff) + (x >> 16); /* add up carry.. 
*/ x = (x & 0xffff) + (x >> 16); return x; } /* * This function code has been taken from * Linux kernel lib/checksum.c */ static unsigned int do_csum(const unsigned char *buff, int len) { unsigned int result = 0; int odd; if (len <= 0) goto out; odd = 1 & (unsigned long)buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long)buff) { result += *(unsigned short *)buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned int)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *)buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *)buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. * This function code has been taken from * Linux kernel lib/checksum.c */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__sum16)~do_csum(iph, ihl * 4); } /* * Fold a partial checksum * This function code has been taken from * Linux kernel include/asm-generic/checksum.h */ static inline __sum16 csum_fold(__wsum csum) { __u32 sum = (__u32)csum; sum = (sum & 0xffff) + (sum >> 16); sum = (sum & 0xffff) + (sum >> 16); return (__sum16)~sum; } /* * This function code has been taken from * Linux kernel lib/checksum.c */ static inline __u32 from64to32(__u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. 
*/ x = (x & 0xffffffff) + (x >> 32); return (__u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum); /* * This function code has been taken from * Linux kernel lib/checksum.c */ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__u32)sum; s += (__u32)saddr; s += (__u32)daddr; #ifdef __BIG_ENDIAN__ s += proto + len; #else s += ((unsigned long long)proto + len) << 8; #endif return (__wsum)from64to32(s); } /* * This function has been taken from * Linux kernel include/asm-generic/checksum.h */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } static inline __u16 udp_csum(__u32 saddr, __u32 daddr, __u32 len, __u8 proto, __u16 *udp_pkt) { __u32 csum = 0; __u32 cnt = 0; /* udp hdr and data */ for (; cnt < len; cnt += 2) csum += udp_pkt[cnt >> 1]; return csum_tcpudp_magic(saddr, daddr, len, proto, csum); } #define ETH_FCS_SIZE 4 #define ETH_HDR_SIZE(opt) ((opt)->vlan_tag ? sizeof(struct vlan_ethhdr) : \ sizeof(struct ethhdr)) #define PKTGEN_HDR_SIZE(opt) ((opt)->timestamp ? 
sizeof(struct pktgen_hdr) : 0) #define PKT_HDR_SIZE(opt) (ETH_HDR_SIZE(opt) + sizeof(struct iphdr) + \ sizeof(struct udphdr) + PKTGEN_HDR_SIZE(opt)) #define PKTGEN_HDR_OFFSET(opt) (ETH_HDR_SIZE(opt) + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define PKTGEN_SIZE_MIN(opt) (PKTGEN_HDR_OFFSET(opt) + sizeof(struct pktgen_hdr) + \ ETH_FCS_SIZE) #define PKT_SIZE(opt) ((opt)->tx_pkt_size - ETH_FCS_SIZE) #define IP_PKT_SIZE(opt) (PKT_SIZE(opt) - ETH_HDR_SIZE(opt)) #define UDP_PKT_SIZE(opt) (IP_PKT_SIZE(opt) - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE(opt) (UDP_PKT_SIZE(opt) - \ (sizeof(struct udphdr) + PKTGEN_HDR_SIZE(opt))) static void gen_eth_hdr_data(struct xsk_ctx *ctx) { struct pktgen_hdr *pktgen_hdr; struct udphdr *udp_hdr; struct iphdr *ip_hdr; if (ctx->opt.vlan_tag) { struct vlan_ethhdr *veth_hdr = (struct vlan_ethhdr *)ctx->pkt_data; __u16 vlan_tci = 0; udp_hdr = (struct udphdr *)(ctx->pkt_data + sizeof(struct vlan_ethhdr) + sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(ctx->pkt_data + sizeof(struct vlan_ethhdr)); pktgen_hdr = (struct pktgen_hdr *)(ctx->pkt_data + sizeof(struct vlan_ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr)); /* ethernet & VLAN header */ memcpy(veth_hdr->h_dest, &ctx->opt.dst_mac, ETH_ALEN); memcpy(veth_hdr->h_source, &ctx->opt.src_mac, ETH_ALEN); veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); vlan_tci = ctx->opt.vlan_id & VLAN_VID_MASK; vlan_tci |= (ctx->opt.vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; veth_hdr->h_vlan_TCI = htons(vlan_tci); veth_hdr->h_vlan_encapsulated_proto = htons(ETH_P_IP); } else { struct ethhdr *eth_hdr = (struct ethhdr *)ctx->pkt_data; udp_hdr = (struct udphdr *)(ctx->pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(ctx->pkt_data + sizeof(struct ethhdr)); pktgen_hdr = (struct pktgen_hdr *)(ctx->pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr)); /* ethernet header */ memcpy(eth_hdr->h_dest, &ctx->opt.dst_mac, ETH_ALEN); 
memcpy(eth_hdr->h_source, &ctx->opt.src_mac, ETH_ALEN); eth_hdr->h_proto = htons(ETH_P_IP); } /* IP header */ ip_hdr->version = IPVERSION; ip_hdr->ihl = 0x5; /* 20 byte header */ ip_hdr->tos = 0x0; ip_hdr->tot_len = htons(IP_PKT_SIZE(&ctx->opt)); ip_hdr->id = 0; ip_hdr->frag_off = 0; ip_hdr->ttl = IPDEFTTL; ip_hdr->protocol = IPPROTO_UDP; ip_hdr->saddr = htonl(0x0a0a0a10); ip_hdr->daddr = htonl(0x0a0a0a20); /* IP header checksum */ ip_hdr->check = 0; ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); /* UDP header */ udp_hdr->source = htons(0x1000); udp_hdr->dest = htons(0x1000); udp_hdr->len = htons(UDP_PKT_SIZE(&ctx->opt)); if (ctx->opt.timestamp) pktgen_hdr->pgh_magic = htonl(PKTGEN_MAGIC); /* UDP data */ memset32_htonl(ctx->pkt_data + PKT_HDR_SIZE(&ctx->opt), ctx->opt.pkt_fill_pattern, UDP_PKT_DATA_SIZE(&ctx->opt)); /* UDP header checksum */ udp_hdr->check = 0; udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE(&ctx->opt), IPPROTO_UDP, (__u16 *)udp_hdr); } static void gen_eth_frames(struct xsk_ctx *ctx, struct xsk_umem_info *umem, __u32 frame_size) { __u32 copy_len = frame_size; __u32 len = 0; unsigned int i; for (i = 0; i < NUM_FRAMES; i++) { __u64 addr = (__u64)i * frame_size; if (!len) { len = PKT_SIZE(&ctx->opt); copy_len = frame_size; } if (len < frame_size) copy_len = len; memcpy(xsk_umem__get_data(umem->buffer, addr), ctx->pkt_data + PKT_SIZE(&ctx->opt) - len, copy_len); len -= copy_len; } } static struct xsk_umem_info *xsk_configure_umem(void *buffer, __u64 size, __u32 frame_size, __u32 umem_flags) { struct xsk_umem_info *umem; struct xsk_umem_config cfg = { /* We recommend that you set the fill ring size >= HW RX ring size + * AF_XDP RX ring size. Make sure you fill up the fill ring * with buffers at regular intervals, and you will with this setting * avoid allocation failures in the driver. These are usually quite * expensive since drivers have not been written to assume that * allocation failures are common. 
For regular sockets, kernel * allocated memory is used that only runs out in OOM situations * that should be rare. */ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = frame_size, .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, .flags = umem_flags }; int ret; umem = calloc(1, sizeof(*umem)); if (!umem) return ERR_PTR(-errno); ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, &cfg); if (ret) { free(umem); return ERR_PTR(ret); } umem->buffer = buffer; return umem; } static int xsk_populate_fill_ring(struct xsk_umem_info *umem, __u32 frame_size) { int ret, i; __u32 idx; ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) return -ret; for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) *xsk_ring_prod__fill_addr(&umem->fq, idx++) = (__u64)i * frame_size; xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); return 0; } static struct xsk_socket_info *xsk_configure_socket(const struct xsk_opts *opt, struct xsk_umem_info *umem, bool rx, bool tx) { __u32 xdp_bind_flags = opt->no_need_wakeup ? 0 : XDP_USE_NEED_WAKEUP; struct xsk_socket_config cfg = {}; struct xsk_socket_info *xsk; struct xsk_ring_cons *rxr; struct xsk_ring_prod *txr; int ret; if (opt->attach_mode == XDP_MODE_SKB) xdp_bind_flags |= XDP_COPY; xdp_bind_flags |= opt->copy_mode; xsk = calloc(1, sizeof(*xsk)); if (!xsk) return ERR_PTR(-errno); xsk->umem = umem; xsk->copy_mode = !!(xdp_bind_flags & XDP_COPY); cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; if (opt->shared_umem || opt->frags) cfg.libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD; else cfg.libxdp_flags = 0; if (opt->attach_mode == XDP_MODE_SKB) cfg.xdp_flags = XDP_FLAGS_SKB_MODE; else cfg.xdp_flags = XDP_FLAGS_DRV_MODE; cfg.bind_flags = xdp_bind_flags; rxr = rx ? &xsk->rx : NULL; txr = tx ? 
&xsk->tx : NULL; ret = xsk_socket__create(&xsk->xsk, opt->iface.ifname, opt->queue_idx, umem->umem, rxr, txr, &cfg); if (ret) goto err; xsk->app_stats.rx_empty_polls = 0; xsk->app_stats.fill_fail_polls = 0; xsk->app_stats.copy_tx_sendtos = 0; xsk->app_stats.tx_wakeup_sendtos = 0; xsk->app_stats.opt_polls = 0; xsk->app_stats.prev_rx_empty_polls = 0; xsk->app_stats.prev_fill_fail_polls = 0; xsk->app_stats.prev_copy_tx_sendtos = 0; xsk->app_stats.prev_tx_wakeup_sendtos = 0; xsk->app_stats.prev_opt_polls = 0; return xsk; err: free(xsk); return ERR_PTR(ret); } static int kick_tx(struct xsk_socket_info *xsk) { int ret; ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) return 0; return -errno; } static inline int complete_tx_l2fwd(struct xsk_socket_info *xsk, __u32 batch_size, bool busy_poll) { struct xsk_umem_info *umem = xsk->umem; __u32 idx_cq = 0, idx_fq = 0; unsigned int rcvd; size_t ndescs; int ret; if (!xsk->outstanding_tx) return 0; /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to * really send the packets. In zero-copy mode we do not have to do this, since Tx * is driven by the NAPI loop. So as an optimization, we do not have to call * sendto() all the time in zero-copy mode for l2fwd. 
*/ if (xsk->copy_mode) { xsk->app_stats.copy_tx_sendtos++; ret = kick_tx(xsk); if (ret) return ret; } ndescs = min(xsk->outstanding_tx, batch_size); /* re-add completed Tx buffers */ rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); if (rcvd > 0) { unsigned int i; int ret; ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); while (ret != (int)rcvd) { if (ret < 0) return ret; if (busy_poll || xsk_ring_prod__needs_wakeup(&umem->fq)) { xsk->app_stats.fill_fail_polls++; if (recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL) < 0) return -errno; } ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } for (i = 0; i < rcvd; i++) *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; } return 0; } static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size, bool need_wakeup) { unsigned int rcvd; __u32 idx; if (!xsk->outstanding_tx) return; if (!need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); } rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; } } static int rx_drop(struct xsk_socket_info *xsk, __u32 batch_size, bool busy_poll) { unsigned int rcvd, i, eop_cnt = 0; __u32 idx_rx = 0, idx_fq = 0; int ret; rcvd = xsk_ring_cons__peek(&xsk->rx, batch_size, &idx_rx); if (!rcvd) { if (busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.rx_empty_polls++; if (recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL) < 0) return -errno; } return 0; } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); while ((unsigned int)ret != rcvd) { if (ret < 0) return ret; if (busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.fill_fail_polls++; if 
(recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL) < 0) return -errno; } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } for (i = 0; i < rcvd; i++) { const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); __u64 addr = desc->addr; __u32 len = desc->len; __u64 orig = xsk_umem__extract_addr(addr); eop_cnt += IS_EOP_DESC(desc->options); addr = xsk_umem__add_offset_to_addr(addr); char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); hex_dump(pkt, len, addr); *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; } xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); xsk->ring_stats.rx_npkts += eop_cnt; xsk->ring_stats.rx_frags += rcvd; return 0; } static void *xsk_rx_drop_all(void *arg) { struct xsk_ctx *ctx = arg; struct pollfd fds[MAX_SOCKS] = {}; unsigned int i; int ret; for (i = 0; i < ctx->num_socks; i++) { fds[i].fd = xsk_socket__fd(ctx->xsks[i]->xsk); fds[i].events = POLLIN; } while (!ctx->benchmark_done) { if (ctx->opt.use_poll) { for (i = 0; i < ctx->num_socks; i++) ctx->xsks[i]->app_stats.opt_polls++; ret = poll(fds, ctx->num_socks, ctx->poll_timeout); if (ret <= 0) continue; } for (i = 0; i < ctx->num_socks; i++) rx_drop(ctx->xsks[i], ctx->opt.batch_size, ctx->opt.busy_poll); } return NULL; } static int tx_only(struct xsk_ctx *ctx, struct xsk_socket_info *xsk, __u32 *frame_nb, int batch_size, unsigned long tx_ns) { int i; __u32 idx, tv_sec = 0, tv_usec = 0; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < (unsigned int)batch_size) { complete_tx_only(xsk, batch_size, !ctx->opt.no_need_wakeup); if (ctx->benchmark_done) return 0; } if (ctx->opt.timestamp) { tv_sec = (__u32)(tx_ns / NSEC_PER_SEC); tv_usec = (__u32)((tx_ns % NSEC_PER_SEC) / 1000); } for (i = 0; i < batch_size; ) { __u32 len = PKT_SIZE(&ctx->opt); do { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (__u64)*frame_nb * ctx->opt.frame_size; if (len > 
ctx->opt.frame_size) { tx_desc->len = ctx->opt.frame_size; tx_desc->options = XDP_PKT_CONTD; } else { tx_desc->len = len; tx_desc->options = 0; xsk->ring_stats.tx_npkts++; } len -= tx_desc->len; *frame_nb = (*frame_nb + 1) % NUM_FRAMES; i++; if (ctx->opt.timestamp) { struct pktgen_hdr *pktgen_hdr; __u64 addr = tx_desc->addr; char *pkt; pkt = xsk_umem__get_data(xsk->umem->buffer, addr); pktgen_hdr = (struct pktgen_hdr *)(pkt + PKTGEN_HDR_OFFSET(&ctx->opt)); pktgen_hdr->seq_num = htonl(ctx->sequence++); pktgen_hdr->tv_sec = htonl(tv_sec); pktgen_hdr->tv_usec = htonl(tv_usec); hex_dump(pkt, PKT_SIZE(&ctx->opt), addr); } } while (len); } xsk_ring_prod__submit(&xsk->tx, batch_size); xsk->outstanding_tx += batch_size; xsk->ring_stats.tx_frags += batch_size; complete_tx_only(xsk, batch_size, !ctx->opt.no_need_wakeup); return batch_size / ctx->frames_per_pkt; } static inline int get_batch_size(const struct xsk_ctx *ctx, int pkt_cnt) { if (!ctx->opt.pkt_count) return ctx->opt.batch_size * ctx->frames_per_pkt; if (pkt_cnt + ctx->opt.batch_size <= ctx->opt.pkt_count) return ctx->opt.batch_size * ctx->frames_per_pkt; return (ctx->opt.pkt_count - pkt_cnt) * ctx->frames_per_pkt; } static void complete_tx_only_all(struct xsk_ctx *ctx) { bool pending; unsigned int i; do { pending = false; for (i = 0; i < ctx->num_socks; i++) { if (ctx->xsks[i]->outstanding_tx) { complete_tx_only(ctx->xsks[i], ctx->opt.batch_size, !ctx->opt.no_need_wakeup); pending = !!ctx->xsks[i]->outstanding_tx; } } sleep(1); } while (pending && ctx->retries-- > 0); } static void *xsk_tx_only_all(void *arg) { struct xsk_ctx *ctx = arg; struct pollfd fds[MAX_SOCKS] = {}; __u32 frame_nb[MAX_SOCKS] = {}; unsigned long next_tx_ns = 0; unsigned int pkt_cnt = 0, i; int ret; for (i = 0; i < ctx->num_socks; i++) { fds[0].fd = xsk_socket__fd(ctx->xsks[i]->xsk); fds[0].events = POLLOUT; } if (ctx->tx_cycle_ns) { /* Align Tx time to micro-second boundary */ next_tx_ns = (get_nsecs(ctx->opt.clock) / NSEC_PER_USEC + 1) * 
NSEC_PER_USEC; next_tx_ns += ctx->tx_cycle_ns; /* Initialize periodic Tx scheduling variance */ ctx->tx_cycle_diff_min = 1000000000; ctx->tx_cycle_diff_max = 0; ctx->tx_cycle_diff_ave = 0.0; } while (!ctx->benchmark_done && ((ctx->opt.pkt_count && pkt_cnt < ctx->opt.pkt_count) || !ctx->opt.pkt_count)) { int batch_size = get_batch_size(ctx, pkt_cnt); unsigned long tx_ns = 0; struct timespec next; int tx_cnt = 0; long diff; int err; if (ctx->opt.use_poll) { for (i = 0; i < ctx->num_socks; i++) ctx->xsks[i]->app_stats.opt_polls++; ret = poll(fds, ctx->num_socks, ctx->poll_timeout); if (ret <= 0) continue; if (!(fds[0].revents & POLLOUT)) continue; } if (ctx->tx_cycle_ns) { next.tv_sec = next_tx_ns / NSEC_PER_SEC; next.tv_nsec = next_tx_ns % NSEC_PER_SEC; err = clock_nanosleep(ctx->opt.clock, TIMER_ABSTIME, &next, NULL); if (err) { if (err != EINTR) pr_warn("clock_nanosleep failed. Err:%d errno:%d\n", err, errno); return ERR_PTR(err); } /* Measure periodic Tx scheduling variance */ tx_ns = get_nsecs(ctx->opt.clock); diff = tx_ns - next_tx_ns; if (diff < ctx->tx_cycle_diff_min) ctx->tx_cycle_diff_min = diff; if (diff > ctx->tx_cycle_diff_max) ctx->tx_cycle_diff_max = diff; ctx->tx_cycle_diff_ave += (double)diff; ctx->tx_cycle_cnt++; } else if (ctx->opt.timestamp) { tx_ns = get_nsecs(ctx->opt.clock); } for (i = 0; i < ctx->num_socks; i++) tx_cnt += tx_only(ctx, ctx->xsks[i], &frame_nb[i], batch_size, tx_ns); pkt_cnt += tx_cnt; if (ctx->tx_cycle_ns) next_tx_ns += ctx->tx_cycle_ns; } if (ctx->opt.pkt_count) complete_tx_only_all(ctx); return NULL; } static int l2fwd(struct xsk_ctx *ctx, struct xsk_socket_info *xsk) { __u32 idx_rx = 0, idx_tx = 0, frags_done = 0; unsigned int rcvd, i, eop_cnt = 0; static __u32 nb_frags; int ret; complete_tx_l2fwd(xsk, ctx->opt.batch_size, ctx->opt.busy_poll); rcvd = xsk_ring_cons__peek(&xsk->rx, ctx->opt.batch_size, &idx_rx); if (!rcvd) { if (ctx->opt.busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 
xsk->app_stats.rx_empty_polls++; if (recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL) < 0) return -errno; } return 0; } ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); while ((unsigned int)ret != rcvd) { if (ret < 0) return ret; complete_tx_l2fwd(xsk, ctx->opt.batch_size, ctx->opt.busy_poll); if (ctx->opt.busy_poll || xsk_ring_prod__needs_wakeup(&xsk->tx)) { xsk->app_stats.tx_wakeup_sendtos++; ret = kick_tx(xsk); if (ret) return ret; } ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); } for (i = 0; i < rcvd; i++) { const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); bool eop = IS_EOP_DESC(desc->options); __u64 addr = desc->addr; __u32 len = desc->len; __u64 orig = addr; addr = xsk_umem__add_offset_to_addr(addr); char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); if (!nb_frags++) swap_mac_addresses(pkt); hex_dump(pkt, len, addr); struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++); tx_desc->options = eop ? 0 : XDP_PKT_CONTD; tx_desc->addr = orig; tx_desc->len = len; if (eop) { frags_done += nb_frags; nb_frags = 0; eop_cnt++; } } xsk_ring_prod__submit(&xsk->tx, frags_done); xsk_ring_cons__release(&xsk->rx, frags_done); xsk->ring_stats.rx_npkts += eop_cnt; xsk->ring_stats.tx_npkts += eop_cnt; xsk->ring_stats.rx_frags += rcvd; xsk->ring_stats.tx_frags += rcvd; xsk->outstanding_tx += frags_done; return 0; } void *xsk_l2fwd_all(void *arg) { struct xsk_ctx *ctx = arg; struct pollfd fds[MAX_SOCKS] = {}; unsigned int i; int ret; while (!ctx->benchmark_done) { if (ctx->opt.use_poll) { for (i = 0; i < ctx->num_socks; i++) { fds[i].fd = xsk_socket__fd(ctx->xsks[i]->xsk); fds[i].events = POLLOUT | POLLIN; ctx->xsks[i]->app_stats.opt_polls++; } ret = poll(fds, ctx->num_socks, ctx->poll_timeout); if (ret <= 0) continue; } for (i = 0; i < ctx->num_socks; i++) l2fwd(ctx, ctx->xsks[i]); } return NULL; } static struct xdp_program *load_xdp_program(struct xsk_ctx *ctx, const struct xsk_opts *opt, bool 
populate_map) { DECLARE_LIBBPF_OPTS(xdp_program_opts, opts, .prog_name = "xdp_sock_prog"); struct xdp_program *xdp_prog = NULL, *ret_prog; char errmsg[STRERR_BUFSIZE]; struct xdpsock *skel; unsigned int i; int err; skel = xdpsock__open(); if (!skel) { err = -errno; pr_warn("Failed to load skeleton: %s\n", strerror(-err)); goto err; } opts.obj = skel->obj; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { err = -errno; pr_warn("Failed to create XDP program: %s\n", strerror(-err)); goto err; } /* we can't set this from the program section because libbpf won't let * us turn it back off if we do. So set it here to allow the automatic * logic for turning off the flag in libxdp to work */ xdp_program__set_xdp_frags_support(xdp_prog, true); err = xdp_program__attach(xdp_prog, opt->iface.ifindex, opt->attach_mode, 0); if (err) { libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("ERROR: attaching program failed: %s\n", errmsg); goto err; } if (populate_map) { skel->bss->num_socks = ctx->num_socks; for (i = 0; i < ctx->num_socks; i++) { int fd = xsk_socket__fd(ctx->xsks[i]->xsk); int key = i; err = bpf_map_update_elem( bpf_map__fd(skel->maps.xsks_map), &key, &fd, 0); if (err) { pr_warn("ERROR: bpf_map_update_elem %d\n", i); goto err; } } } /* Clone the xdp_prog before returning to avoid having a dangling * reference to the skeleton. 
*/ ret_prog = xdp_program__clone(xdp_prog, 0); if (!ret_prog) { err = -errno; pr_warn("Couldn't clone xdp_program: %s\n", strerror(-err)); goto err; } xdp_program__close(xdp_prog); xdpsock__destroy(skel); return ret_prog; err: xdp_program__close(xdp_prog); xdpsock__destroy(skel); return ERR_PTR(err); } static int apply_busy_poll_opts(struct xsk_socket *xsk, __u32 batch_size) { int sock_opt; sock_opt = 1; if (setsockopt(xsk_socket__fd(xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, (void *)&sock_opt, sizeof(sock_opt)) < 0) return -errno; sock_opt = 20; if (setsockopt(xsk_socket__fd(xsk), SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt, sizeof(sock_opt)) < 0) return -errno; sock_opt = batch_size; if (setsockopt(xsk_socket__fd(xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, (void *)&sock_opt, sizeof(sock_opt)) < 0) return -errno; return 0; } bool xsk_probe_busy_poll(void) { struct xsk_socket_config cfg = { .libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD, .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, }; unsigned int mmap_flags = 0, umem_flags = 0; __u32 frame_size = 4096, batch_size = 64; struct xsk_umem_info *umem = NULL; struct xsk_socket *xsk = NULL; struct xsk_ring_cons rx; void *bufs; int ret; bufs = mmap(NULL, NUM_FRAMES * frame_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | mmap_flags, -1, 0); if (bufs == MAP_FAILED) { ret = -errno; pr_debug("Failed to mmap: %d\n", ret); goto out; } umem = xsk_configure_umem(bufs, NUM_FRAMES * frame_size, frame_size, umem_flags); if (IS_ERR(umem)) { ret = PTR_ERR(umem); pr_debug("Failed to configure umem: %d\n", ret); umem = NULL; goto out; } ret = xsk_socket__create(&xsk, "lo", 0, umem->umem, &rx, NULL, &cfg); if (ret) { pr_debug("Failed to create socket: %d\n", ret); goto out; } ret = apply_busy_poll_opts(xsk, batch_size); pr_debug("Apply busy poll opts returned %d\n", ret); out: xsk_socket__delete(xsk); if (umem) { xsk_umem__delete(umem->umem); free(umem); } munmap(bufs, NUM_FRAMES * frame_size); return !ret; } static int 
xsk_set_sched_priority(enum xsk_sched_policy sched_policy, unsigned int sched_prio) { struct sched_param schparam = { .sched_priority = sched_prio, }; int ret; /* Configure sched priority for better wake-up accuracy */ ret = sched_setscheduler(0, sched_policy, &schparam); if (ret) pr_warn("Error(%d) in setting priority(%d): %s\n", errno, sched_prio, strerror(errno)); return ret; } struct xsk_ctx *xsk_ctx__create(const struct xsk_opts *opt, enum xsk_benchmark_type bench) { unsigned int mmap_flags = 0, umem_flags = 0, num_xsks = 1, i; struct xsk_umem_info *umem = NULL; bool rx = false, tx = false; struct xsk_ctx *ctx; int ret = -ENOMEM; sigset_t st; void *bufs; switch (bench) { case XSK_BENCH_RXDROP: rx = true; break; case XSK_BENCH_TXONLY: tx = true; break; case XSK_BENCH_L2FWD: rx = true; tx = true; break; } if (opt->unaligned) { mmap_flags = MAP_HUGETLB; umem_flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG; } if (opt->shared_umem) num_xsks = MAX_SOCKS; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return ERR_PTR(-ENOMEM); bufs = mmap(NULL, NUM_FRAMES * opt->frame_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | mmap_flags, -1, 0); if (bufs == MAP_FAILED) { pr_warn("ERROR: mmap failed\n"); goto err; } /* Create sockets... 
*/ umem = xsk_configure_umem(bufs, NUM_FRAMES * opt->frame_size, opt->frame_size, umem_flags); if (IS_ERR(umem)) { ret = PTR_ERR(umem); umem = NULL; goto err; } if (rx) { ret = xsk_populate_fill_ring(umem, opt->frame_size); if (ret) goto err; } for (i = 0; i < num_xsks; i++) { struct xsk_socket_info *xsk = xsk_configure_socket(opt, umem, rx, tx); if (IS_ERR(xsk)) { ret = PTR_ERR(xsk); goto err; } ctx->xsks[ctx->num_socks++] = xsk; } if (opt->busy_poll) { for (i = 0; i < num_xsks; i++) { ret = apply_busy_poll_opts(ctx->xsks[i]->xsk, opt->batch_size); if (ret) { pr_warn("ERROR: Couldn't apply busy poll options: %s\n", strerror(-ret)); goto err; } } } if (opt->irq_string) { ret = -ENOENT; if (get_interrupt_number(ctx, opt->irq_string)) ret = get_irqs(ctx); if (ret < 0) { pr_warn("ERROR: Failed to get irqs for %s\n", opt->irq_string); goto err; } ctx->irqs_at_init = ret; } ret = xsk_set_sched_priority(opt->sched_policy, opt->sched_prio); if (ret) goto err; memcpy((void *)&ctx->opt, opt, sizeof(ctx->opt)); if (bench == XSK_BENCH_TXONLY) { gen_eth_hdr_data(ctx); gen_eth_frames(ctx, umem, opt->frame_size); } ctx->frames_per_pkt = (opt->tx_pkt_size - 1) / XSK_UMEM__DEFAULT_FRAME_SIZE + 1; if (opt->shared_umem || opt->frags) { struct xdp_program *xdp_prog = load_xdp_program(ctx, opt, rx); if (IS_ERR(xdp_prog)) { ret = PTR_ERR(xdp_prog); goto err; } ctx->xdp_prog = xdp_prog; } sigemptyset(&st); sigaddset(&st, SIGQUIT); sigaddset(&st, SIGINT); sigaddset(&st, SIGTERM); if (sigprocmask(SIG_BLOCK, &st, NULL) < 0) { ret = -errno; goto err; } ctx->signal_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK); if (ctx->signal_fd < 0) { ret = -errno; goto err; } ctx->bufs = bufs; ctx->umem = umem; ctx->bench = bench; ctx->prev_time = ctx->start_time = get_nsecs(ctx->opt.clock); ctx->tx_cycle_ns = opt->tx_cycle_us * NSEC_PER_USEC; ctx->poll_timeout = POLL_TIMEOUT; ctx->duration = opt->duration * NSEC_PER_SEC; ctx->retries = opt->retries; ctx->extra_stats = opt->extra_stats; ctx->rx = rx; 
ctx->tx = tx; return ctx; err: if (ctx->xdp_prog) { xdp_program__detach(ctx->xdp_prog, ctx->opt.iface.ifindex, ctx->opt.attach_mode, 0); xdp_program__close(ctx->xdp_prog); } for (i = 0; i < ctx->num_socks; i++) { xsk_socket__delete(ctx->xsks[i]->xsk); free(ctx->xsks[i]); } free(umem); munmap(bufs, NUM_FRAMES * opt->frame_size); free(ctx); return ERR_PTR(ret); } int xsk_start_bench(struct xsk_ctx *ctx, pthread_t *pt) { switch (ctx->bench) { case XSK_BENCH_RXDROP: return pthread_create(pt, NULL, xsk_rx_drop_all, ctx); case XSK_BENCH_L2FWD: return pthread_create(pt, NULL, xsk_l2fwd_all, ctx); case XSK_BENCH_TXONLY: return pthread_create(pt, NULL, xsk_tx_only_all, ctx); default: return -EINVAL; } } int xsk_validate_opts(const struct xsk_opts *opt) { if (opt->attach_mode == XDP_MODE_SKB && opt->copy_mode == XSK_COPY_ZEROCOPY) { pr_warn("Can't use zero-copy and skb mode together.\n"); return -EINVAL; } if (!opt->unaligned && opt->frame_size & (opt->frame_size -1)) { pr_warn("Frame size %u is not a power of two.\n", opt->frame_size); return -EINVAL; } if (opt->use_poll && opt->tx_cycle_us) { pr_warn("Error: --poll and --tx-cycles are both set\n"); return -EINVAL; } if (opt->timestamp && opt->tx_pkt_size < PKTGEN_SIZE_MIN(opt)) { pr_warn("TX packet size %d less than minimum %zu bytes when timestamps are enabled\n", opt->tx_pkt_size, PKTGEN_SIZE_MIN(opt)); return -EINVAL; } if (opt->tx_pkt_size > MAX_PKT_SIZE || opt->tx_pkt_size < MIN_PKT_SIZE) { pr_warn("Invalid packet size %d (min %d max %x)\n", opt->tx_pkt_size, MIN_PKT_SIZE, MAX_PKT_SIZE); return -EINVAL; } return 0; } xdp-tools-1.6.1/lib/util/xdpsock.h000066400000000000000000000051061514310632100170150ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 * * Copyright(c) 2019 - 2022 Intel Corporation. 
*/ #ifndef XDPSOCK_H_ #define XDPSOCK_H_ #include #include #include #include #include #include #include "params.h" #define MAX_SOCKS 4 #define MIN_PKT_SIZE 64 #define MAX_PKT_SIZE 9728 /* Max frame size supported by many NICs */ enum xsk_benchmark_type { XSK_BENCH_RXDROP = 0, XSK_BENCH_TXONLY = 1, XSK_BENCH_L2FWD = 2, }; enum xsk_program_mode { XSK_RXDROP, XSK_SWAP_MACS, }; enum xsk_copy_mode { XSK_COPY_AUTO = 0, XSK_COPY_COPY = XDP_COPY, XSK_COPY_ZEROCOPY = XDP_ZEROCOPY, }; enum xsk_clock { XSK_CLOCK_MONOTONIC = CLOCK_MONOTONIC, XSK_CLOCK_REALTIME = CLOCK_REALTIME, XSK_CLOCK_TAI = CLOCK_TAI, XSK_CLOCK_BOOTTIME = CLOCK_BOOTTIME, }; enum xsk_sched_policy { XSK_SCHED_OTHER = SCHED_OTHER, XSK_SCHED_FIFO = SCHED_FIFO, }; struct xsk_opts { __u32 queue_idx; __u32 interval; __u32 retries; __u32 frame_size; __u32 duration; __u32 batch_size; __u32 sched_prio; bool use_poll; bool no_need_wakeup; bool unaligned; bool extra_stats; bool quiet; bool app_stats; bool busy_poll; bool frags; bool shared_umem; char *irq_string; enum xdp_attach_mode attach_mode; enum xsk_program_mode program_mode; enum xsk_copy_mode copy_mode; enum xsk_clock clock; enum xsk_sched_policy sched_policy; struct iface iface; /* tx-only options */ bool vlan_tag; bool timestamp; __u16 vlan_id; __u16 vlan_pri; __u16 tx_pkt_size; __u32 pkt_fill_pattern; __u32 pkt_count; __u64 tx_cycle_us; struct mac_addr dst_mac; struct mac_addr src_mac; }; struct xsk_ctx { struct xsk_opts opt; unsigned long prev_time; long tx_cycle_diff_min; long tx_cycle_diff_max; double tx_cycle_diff_ave; long tx_cycle_cnt; unsigned long tx_cycle_ns; unsigned long start_time; unsigned long duration; bool benchmark_done; __u32 irq_no; int irqs_at_init; __u32 sequence; int frames_per_pkt; int poll_timeout; __u32 retries; struct xdp_program *xdp_prog; struct xsk_umem_info *umem; void *bufs; int signal_fd; bool extra_stats; unsigned int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; __u8 pkt_data[MAX_PKT_SIZE]; enum xsk_benchmark_type 
bench; bool rx; bool tx; }; int xsk_validate_opts(const struct xsk_opts *opt); struct xsk_ctx *xsk_ctx__create(const struct xsk_opts *opt, enum xsk_benchmark_type bench); void xsk_ctx__destroy(struct xsk_ctx *ctx); int xsk_stats_poller(struct xsk_ctx *ctx); int xsk_start_bench(struct xsk_ctx *ctx, pthread_t *pt); bool xsk_probe_busy_poll(void); #endif /* XDPSOCK_H */ xdp-tools-1.6.1/lib/util/xpcapng.c000066400000000000000000000437561514310632100170120ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* * Description: * Simple PcapNG library developed from scratch as no library existed that * met the requirements for xdpdump. It can also be used by other XDP * applications that would like to capture packets for debugging purposes. */ /***************************************************************************** * Include files *****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include "xpcapng.h" /***************************************************************************** * Simple roundup() macro *****************************************************************************/ #ifndef roundup #define roundup(x, y) ( \ { \ typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ } \ ) #endif /***************************************************************************** * pcapng_dumper structure *****************************************************************************/ struct xpcapng_dumper { int pd_fd; uint32_t pd_interfaces; }; /***************************************************************************** * general pcapng block and option definitions *****************************************************************************/ enum pcapng_block_types { PCAPNG_SECTION_BLOCK = 0x0A0D0D0A, PCAPNG_INTERFACE_BLOCK = 1, PCAPNG_PACKET_BLOCK, PCAPNG_SIMPLE_PACKET_BLOCK, PCAPNG_NAME_RESOLUTION_BLOCK, PCAPNG_INTERFACE_STATS_BLOCK, 
PCAPNG_ENHANCED_PACKET_BLOCK }; struct pcapng_option { uint16_t po_type; uint16_t po_length; uint8_t po_data[]; } __attribute__((__packed__)); enum pcapng_opt { PCAPNG_OPT_END = 0, PCAPNG_OPT_COMMENT = 1, PCAPNG_OPT_CUSTOME_2988 = 2988, PCAPNG_OPT_CUSTOME_2989 = 2989, PCAPNG_OPT_CUSTOME_19372 = 19372, PCAPNG_OPT_CUSTOME_19373 = 29373 }; /***************************************************************************** * pcapng section header block definitions *****************************************************************************/ struct pcapng_section_header_block { uint32_t shb_block_type; uint32_t shb_block_length; uint32_t shb_byte_order_magic; uint16_t shb_major_version; uint16_t shb_minor_version; uint64_t shb_section_length; uint8_t shb_options[]; /* The options are followed by another: * uint32_t shb_block_length; */ } __attribute__((__packed__)); #define PCAPNG_BYTE_ORDER_MAGIC 0x1A2B3C4D #define PCAPNG_MAJOR_VERSION 1 #define PCAPNG_MINOR_VERSION 0 enum pcapng_opt_shb { PCAPNG_OPT_SHB_HARDWARE = 2, PCAPNG_OPT_SHB_OS, PCAPNG_OPT_SHB_USERAPPL }; /***************************************************************************** * pcapng interface description block definitions *****************************************************************************/ struct pcapng_interface_description_block { uint32_t idb_block_type; uint32_t idb_block_length; uint16_t idb_link_type; uint16_t idb_reserved; uint32_t idb_snap_len; uint8_t idb_options[]; /* The options are followed by another: * uint32_t idb_block_length; */ } __attribute__((__packed__)); enum pcapng_opt_idb { PCAPNG_OPT_IDB_IF_NAME = 2, PCAPNG_OPT_IDB_IF_DESCRIPTION, PCAPNG_OPT_IDB_IF_IPV4_ADDR, PCAPNG_OPT_IDB_IF_IPV6_ADDR, PCAPNG_OPT_IDB_IF_MAC_ADDR, PCAPNG_OPT_IDB_IF_EUI_ADDR, PCAPNG_OPT_IDB_IF_SPEED, PCAPNG_OPT_IDB_IF_TSRESOL, PCAPNG_OPT_IDB_IF_TZONE, PCAPNG_OPT_IDB_IF_FILTER, PCAPNG_OPT_IDB_IF_OS, PCAPNG_OPT_IDB_IF_FCSLEN, PCAPNG_OPT_IDB_IF_TOFFSET, PCAPNG_OPT_IDB_IF_HARDWARE }; 
/***************************************************************************** * pcapng interface description block definitions *****************************************************************************/ struct pcapng_enhanced_packet_block { uint32_t epb_block_type; uint32_t epb_block_length; uint32_t epb_interface_id; uint32_t epb_timestamp_hi; uint32_t epb_timestamp_low; uint32_t epb_captured_length; uint32_t epb_original_length; uint8_t epb_packet_data[]; /* The packet data is followed by: * uint8_t epb_options[]; * uint32_t epb_block_length; */ } __attribute__((__packed__)); enum pcapng_opt_epb { PCAPNG_OPT_EPB_FLAGS = 2, PCAPNG_OPT_EPB_HASH, PCAPNG_OPT_EPB_DROPCOUNT, PCAPNG_OPT_EPB_PACKETID, PCAPNG_OPT_EPB_QUEUE, PCAPNG_OPT_EPB_VERDICT }; enum pcapng_epb_vedict_type { PCAPNG_EPB_VEDRICT_TYPE_HARDWARE = 0, PCAPNG_EPB_VEDRICT_TYPE_EBPF_TC, PCAPNG_EPB_VEDRICT_TYPE_EBPF_XDP }; /***************************************************************************** * pcapng_get_option_length() *****************************************************************************/ static size_t pcapng_get_option_length(size_t len) { return roundup(sizeof(struct pcapng_option) + len, sizeof(uint32_t)); } /***************************************************************************** * pcapng_add_option() *****************************************************************************/ static struct pcapng_option *pcapng_add_option(struct pcapng_option *opt, uint16_t type, uint16_t length, const void *data) { if (opt == NULL) return NULL; opt->po_type = type; opt->po_length = length; if (data) memcpy(opt->po_data, data, length); return (struct pcapng_option *) ((uint8_t *)opt + pcapng_get_option_length(length)); } /***************************************************************************** * pcapng_write_shb() *****************************************************************************/ static bool pcapng_write_shb(struct xpcapng_dumper *pd, const char *comment, const char *hardware, 
const char *os, const char *user_application) { int rc; size_t shb_length; struct pcapng_section_header_block *shb; struct pcapng_option *opt; if (pd == NULL) { errno = EINVAL; return false; } /* First calculate the total length of the SHB. */ shb_length = sizeof(*shb); if (comment) shb_length += pcapng_get_option_length(strlen(comment)); if (hardware) shb_length += pcapng_get_option_length(strlen(hardware)); if (os) shb_length += pcapng_get_option_length(strlen(os)); if (user_application) shb_length += pcapng_get_option_length( strlen(user_application)); shb_length += pcapng_get_option_length(0); shb_length += sizeof(uint32_t); /* Allocate the SHB and fill it. */ shb = calloc(1, shb_length); if (shb == NULL) { errno = ENOMEM; return false; } shb->shb_block_type = PCAPNG_SECTION_BLOCK; shb->shb_block_length = shb_length; shb->shb_byte_order_magic = PCAPNG_BYTE_ORDER_MAGIC; shb->shb_major_version = PCAPNG_MAJOR_VERSION; shb->shb_minor_version = PCAPNG_MINOR_VERSION; shb->shb_section_length = UINT64_MAX; /* Add the options and block_length value */ opt = (struct pcapng_option *) &shb->shb_options; if (comment) opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT, strlen(comment), comment); if (hardware) opt = pcapng_add_option(opt, PCAPNG_OPT_SHB_HARDWARE, strlen(hardware), hardware); if (os) opt = pcapng_add_option(opt, PCAPNG_OPT_SHB_OS, strlen(os), os); if (user_application) opt = pcapng_add_option(opt, PCAPNG_OPT_SHB_USERAPPL, strlen(user_application), user_application); /* WARNING: If a new option is added, make sure the length calculation * above is also updated! */ opt = pcapng_add_option(opt, PCAPNG_OPT_END, 0, NULL); memcpy(opt, &shb->shb_block_length, sizeof(shb->shb_block_length)); /* Write the SHB, and free its memory. 
*/ rc = write(pd->pd_fd, shb, shb_length); free(shb); if ((size_t)rc != shb_length) return false; return true; } /***************************************************************************** * pcapng_write_idb() *****************************************************************************/ static bool pcapng_write_idb(struct xpcapng_dumper *pd, const char *name, uint16_t snap_len, const char *description, const uint8_t *mac, uint64_t speed, uint8_t ts_resolution, const char *hardware) { int rc; size_t idb_length; struct pcapng_interface_description_block *idb; struct pcapng_option *opt; if (pd == NULL) { errno = EINVAL; return false; } /* First calculate the total length of the IDB. */ idb_length = sizeof(*idb); if (name) idb_length += pcapng_get_option_length(strlen(name)); if (description) idb_length += pcapng_get_option_length(strlen(description)); if (mac) idb_length += pcapng_get_option_length(6); if (speed) idb_length += pcapng_get_option_length(sizeof(uint64_t)); if (ts_resolution != 6 && ts_resolution != 0) idb_length += pcapng_get_option_length(1); if (hardware) idb_length += pcapng_get_option_length(strlen(hardware)); idb_length += pcapng_get_option_length(0); idb_length += sizeof(uint32_t); /* Allocate the IDB and fill it. 
*/ idb = calloc(1, idb_length); if (idb == NULL) { errno = ENOMEM; return false; } idb->idb_block_type = PCAPNG_INTERFACE_BLOCK; idb->idb_block_length = idb_length; idb->idb_link_type = 1; /* Ethernet */ idb->idb_snap_len = snap_len; /* Add the options and block_length value */ opt = (struct pcapng_option *) &idb->idb_options; if (name) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_NAME, strlen(name), name); if (description) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_DESCRIPTION, strlen(description), description); if (mac) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_MAC_ADDR, 6, mac); if (speed) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_SPEED, sizeof(uint64_t), &speed); if (ts_resolution != 6 && ts_resolution != 0) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_TSRESOL, sizeof(uint8_t), &ts_resolution); if (hardware) opt = pcapng_add_option(opt, PCAPNG_OPT_IDB_IF_HARDWARE, strlen(hardware), hardware); /* WARNING: If a new option is added, make sure the length calculation * above is also updated! */ opt = pcapng_add_option(opt, PCAPNG_OPT_END, 0, NULL); memcpy(opt, &idb->idb_block_length, sizeof(idb->idb_block_length)); /* Write the IDB, and free it's memory. 
*/ rc = write(pd->pd_fd, idb, idb_length); free(idb); if ((size_t)rc != idb_length) return false; return true; } /***************************************************************************** * pcapng_write_epb() *****************************************************************************/ static bool pcapng_write_epb(struct xpcapng_dumper *pd, uint32_t ifid, const uint8_t *pkt, uint32_t len, uint32_t caplen, uint64_t timestamp, struct xpcapng_epb_options_s *epb_options) { int i = 0; int rc; size_t pad_length; size_t com_length = 0; size_t epb_length; struct pcapng_enhanced_packet_block epb; struct pcapng_option *opt; struct iovec iov[7]; static uint8_t pad[4] = {0, 0, 0, 0}; uint8_t options[8 + 12 + 12 + 8 + 16 + 4 + 4]; /* PCAPNG_OPT_EPB_FLAGS[8] + * PCAPNG_OPT_EPB_DROPCOUNT[12] + * PCAPNG_OPT_EPB_PACKETID[12] + * PCAPNG_OPT_EPB_QUEUE[8] + * PCAPNG_OPT_EPB_VERDICT[16] + * PCAPNG_OPT_END[4] + * epb_block_length */ static struct xdp_verdict { uint8_t type; int64_t verdict; }__attribute__((__packed__)) verdict = { PCAPNG_EPB_VEDRICT_TYPE_EBPF_XDP, 0 }; if (pd == NULL) { errno = EINVAL; return false; } /* First calculate the total length of the EPB. */ pad_length = roundup(caplen, sizeof(uint32_t)) - caplen; epb_length = sizeof(epb); epb_length += caplen + pad_length; if (epb_options->flags) epb_length += pcapng_get_option_length(sizeof(uint32_t)); if (epb_options->dropcount) epb_length += pcapng_get_option_length(sizeof(uint64_t)); if (epb_options->packetid) epb_length += pcapng_get_option_length(sizeof(uint64_t)); if (epb_options->queue) epb_length += pcapng_get_option_length(sizeof(uint32_t)); if (epb_options->xdp_verdict) epb_length += pcapng_get_option_length(sizeof(verdict)); if (epb_options->comment) { com_length = strlen(epb_options->comment); epb_length += pcapng_get_option_length(com_length); } epb_length += pcapng_get_option_length(0); epb_length += sizeof(uint32_t); /* Fill in the EPB. 
*/ epb.epb_block_type = PCAPNG_ENHANCED_PACKET_BLOCK; epb.epb_block_length = epb_length; epb.epb_interface_id = ifid; epb.epb_timestamp_hi = timestamp >> 32; epb.epb_timestamp_low = (uint32_t) timestamp; epb.epb_captured_length = caplen; epb.epb_original_length = len; /* Add the flag/end option and block_length value */ opt = (struct pcapng_option *) options; if (epb_options->flags) opt = pcapng_add_option(opt, PCAPNG_OPT_EPB_FLAGS, sizeof(uint32_t), &epb_options->flags); if (epb_options->dropcount) opt = pcapng_add_option(opt, PCAPNG_OPT_EPB_DROPCOUNT, sizeof(uint64_t), &epb_options->dropcount); if (epb_options->packetid) opt = pcapng_add_option(opt, PCAPNG_OPT_EPB_PACKETID, sizeof(uint64_t), epb_options->packetid); if (epb_options->queue) opt = pcapng_add_option(opt, PCAPNG_OPT_EPB_QUEUE, sizeof(uint32_t), epb_options->queue); if (epb_options->xdp_verdict) { verdict.verdict = *epb_options->xdp_verdict; opt = pcapng_add_option(opt, PCAPNG_OPT_EPB_VERDICT, sizeof(verdict), &verdict); } /* WARNING: If a new option is added, make sure the length calculation * and the options[] variable above are also updated! */ opt = pcapng_add_option(opt, PCAPNG_OPT_END, 0, NULL); memcpy(opt, &epb.epb_block_length, sizeof(epb.epb_block_length)); /* Write the EPB in parts, including the options, this looks not as * straightforward as pcapng_write_idb() but here we would like to * avoid as many memcopy's as possible. */ /* Add base EPB structure. */ iov[i].iov_base = &epb; iov[i++].iov_len = sizeof(epb); /* Add Packet Data. */ iov[i].iov_base = (void *)pkt; iov[i++].iov_len = caplen; /* Add Packet Data padding if needed. */ if (pad_length > 0) { iov[i].iov_base = pad; iov[i++].iov_len = pad_length; } /* Add comment if supplied */ if (epb_options->comment) { uint16_t opt[2] = {PCAPNG_OPT_COMMENT, com_length}; size_t opt_pad = roundup(com_length, sizeof(uint32_t)) - com_length; /* Add option header. */ iov[i].iov_base = opt; iov[i++].iov_len = sizeof(opt); /* Add actual comment string. 
*/ iov[i].iov_base = (void *)epb_options->comment; iov[i++].iov_len = com_length; /* Add padding to uint32_t if needed. */ if (opt_pad) { iov[i].iov_base = pad; iov[i++].iov_len = opt_pad; } } /* Write other options and final EPB size. */ iov[i].iov_base = options; iov[i++].iov_len = 8 + (epb_options->flags ? 8 : 0) + (epb_options->dropcount ? 12 : 0) + (epb_options->packetid ? 12 : 0) + (epb_options->queue ? 8 : 0) + (epb_options->xdp_verdict ? 16 : 0); rc = writev(pd->pd_fd, iov, i); if ((size_t)rc != epb_length) return false; return true; } /***************************************************************************** * xpcapng_dump_open() *****************************************************************************/ struct xpcapng_dumper *xpcapng_dump_open(const char *file, const char *comment, const char *hardware, const char *os, const char *user_application) { struct xpcapng_dumper *pd = NULL; if (file == NULL) { errno = EINVAL; goto error_exit; } pd = calloc(1, sizeof(*pd)); if (pd == NULL) { errno = ENOMEM; goto error_exit; } pd->pd_fd = -1; if (strcmp(file, "-") == 0) { pd->pd_fd = STDOUT_FILENO; } else { pd->pd_fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0600); if (pd->pd_fd < 0) goto error_exit; } if (!pcapng_write_shb(pd, comment, hardware, os, user_application)) goto error_exit; return pd; error_exit: if (pd) { if (pd->pd_fd >= 0 && pd->pd_fd != STDOUT_FILENO) close(pd->pd_fd); free(pd); } return NULL; } /***************************************************************************** * xpcapng_dump_close() *****************************************************************************/ void xpcapng_dump_close(struct xpcapng_dumper *pd) { if (pd == NULL) return; if (pd->pd_fd < 0 && pd->pd_fd != STDOUT_FILENO) close(pd->pd_fd); free(pd); } /***************************************************************************** * xpcapng_dump_flush() *****************************************************************************/ int xpcapng_dump_flush(struct 
xpcapng_dumper *pd) { if (pd != NULL) return fsync(pd->pd_fd); errno = EINVAL; return -1; } /***************************************************************************** * pcapng_dump_add_interface() *****************************************************************************/ int xpcapng_dump_add_interface(struct xpcapng_dumper *pd, uint16_t snap_len, const char *name, const char *description, const uint8_t *mac, uint64_t speed, uint8_t ts_resolution, const char *hardware) { if (!pcapng_write_idb(pd, name, snap_len, description, mac, speed, ts_resolution, hardware)) return -1; return pd->pd_interfaces++; } /***************************************************************************** * xpcapng_dump_enhanced_pkt() *****************************************************************************/ bool xpcapng_dump_enhanced_pkt(struct xpcapng_dumper *pd, uint32_t ifid, const uint8_t *pkt, uint32_t len, uint32_t caplen, uint64_t timestamp, struct xpcapng_epb_options_s *options) { struct xpcapng_epb_options_s default_options = {}; return pcapng_write_epb(pd, ifid, pkt, len, caplen, timestamp, options ?: &default_options); } xdp-tools-1.6.1/lib/util/xpcapng.h000066400000000000000000000044141514310632100170030ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /***************************************************************************** * Multiple include protection *****************************************************************************/ #ifndef __XPCAPNG_H_ #define __XPCAPNG_H_ /***************************************************************************** * Handle *****************************************************************************/ struct xpcapng_dumper; /***************************************************************************** * Flag variables *****************************************************************************/ enum xpcapng_epb_flags { PCAPNG_EPB_FLAG_INBOUND = 0x1, PCAPNG_EPB_FLAG_OUTBOUND = 0x2 }; 
/***************************************************************************** * EPB options structure *****************************************************************************/ struct xpcapng_epb_options_s { enum xpcapng_epb_flags flags; uint64_t dropcount; uint64_t *packetid; uint32_t *queue; int64_t *xdp_verdict; const char *comment; }; /***************************************************************************** * APIs *****************************************************************************/ extern struct xpcapng_dumper *xpcapng_dump_open(const char *file, const char *comment, const char *hardware, const char *os, const char *user_application); extern void xpcapng_dump_close(struct xpcapng_dumper *pd); extern int xpcapng_dump_flush(struct xpcapng_dumper *pd); extern int xpcapng_dump_add_interface(struct xpcapng_dumper *pd, uint16_t snap_len, const char *name, const char *description, const uint8_t *mac, uint64_t speed, uint8_t ts_resolution, const char *hardware); extern bool xpcapng_dump_enhanced_pkt(struct xpcapng_dumper *pd, uint32_t ifid, const uint8_t *pkt, uint32_t len, uint32_t caplen, uint64_t timestamp, struct xpcapng_epb_options_s *options); /***************************************************************************** * End-of include file *****************************************************************************/ #endif /* __XPCAPNG_H_ */ xdp-tools-1.6.1/mkarchive.sh000077500000000000000000000013531514310632100157560ustar00rootroot00000000000000#!/bin/bash WORKDIR=$(dirname "${BASH_SOURCE[0]}") VERSION="${1:-$(make -f $WORKDIR/version.mk)}" OUTFILE="$WORKDIR/xdp-tools-$VERSION.tar.gz" PREFIX=xdp-tools-$VERSION TMPDIR=$(mktemp -d) set -o errexit set -o nounset trap 'status=$?; rm -rf $TMPDIR; exit $status' EXIT HUP INT QUIT TERM [ -d .git ] || exit 1 if git status -s | grep -Eq '^ ?[AM]'; then echo "Please commit changes first" >&2 exit 1 fi git archive -o "$TMPDIR/xdp-tools.tar.gz" --prefix "${PREFIX}/" HEAD ( cd lib/libbpf && git 
archive -o "$TMPDIR/libbpf.tar.gz" --prefix "${PREFIX}/lib/libbpf/" HEAD) tar -C "$TMPDIR" -xzf "$TMPDIR/xdp-tools.tar.gz" tar -C "$TMPDIR" -xzf "$TMPDIR/libbpf.tar.gz" tar -C "$TMPDIR" -czf "$OUTFILE" "$PREFIX" echo "Created $OUTFILE" xdp-tools-1.6.1/packaging/000077500000000000000000000000001514310632100153705ustar00rootroot00000000000000xdp-tools-1.6.1/packaging/docker/000077500000000000000000000000001514310632100166375ustar00rootroot00000000000000xdp-tools-1.6.1/packaging/docker/Dockerfile000066400000000000000000000013321514310632100206300ustar00rootroot00000000000000FROM debian:13 AS builder ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y \ build-essential \ clang \ llvm \ libelf-dev \ libpcap-dev \ libbpf-dev \ pkg-config \ git \ wget \ m4 \ zlib1g-dev \ libcap-dev \ bpftool \ && rm -rf /var/lib/apt/lists/* COPY . /opt/xdp-tools WORKDIR /opt/xdp-tools RUN ./configure && make && make install FROM debian:13-slim AS runtime RUN apt-get update && apt-get install -y \ libelf1 \ libpcap0.8 \ libbpf1 \ libcap2 \ bpftool \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /usr/local /usr/local ENV PATH="/usr/local/bin:/usr/local/sbin:$PATH" ENTRYPOINT [] CMD ["xdp-loader"] xdp-tools-1.6.1/packaging/docker/docker.sh000077500000000000000000000003531514310632100204460ustar00rootroot00000000000000#!/usr/bin/bash docker run -it --rm \ --privileged \ --network=host \ --cap-add=ALL \ --volume /lib/modules:/lib/modules:ro \ --volume /sys:/sys \ --volume $(pwd):/work \ littlejo/xdp-tools:0.2.0 \ bash xdp-tools-1.6.1/packaging/rpm/000077500000000000000000000000001514310632100161665ustar00rootroot00000000000000xdp-tools-1.6.1/packaging/rpm/README.org000066400000000000000000000114541514310632100176410ustar00rootroot00000000000000#+OPTIONS: ^:nil * Releasing and packaging a new version of xdp-tools These are the steps needed to release a new version of xdp-tools. 
If any of the steps fail, go back and fix the error, then start over from the appropriate step. If the fix requires changes to the sources, commit those, then rewrite the commit made in (1.) on top of this and start over from the beginning. This ensures that we don't end up with a whole series of package revisions just to fix minor errors. ** Steps To release a new version of xdp-tools, follow these steps: 1. Make sure you have valid Kerberos tickets exist for the Fedora and RHEL infrastructure: #+begin_src sh kinit @FEDORAPROJECT.ORG kinit @REDHAT.COM #+end_src 2. Checkout this git repository as well as the RHEL and Fedora dist-git's. Also, make sure you move to the correct branch for RHEL and Fedora. Below is an example of how you could do this: #+begin_src sh export VERSION="" mkdir release_$VERSION cd release_$VERSION git clone git@github.com:xdp-project/xdp-tools.git cd xdp-tools export XDP_TOOLS_DIR=$(pwd) rhpkg clone xdp-tools xdp-tools-RHEL cd xdp-tools-RHEL rhpkg switch-branch -l rhpkg switch-branch rhel-8.3.0 export RHEL_DISTGIT_DIR=$(pwd) cd .. fedpkg clone xdp-tools xdp-tools-FEDORA cd xdp-tools-FEDORA fedpkg switch-branch -l export FEDORA_DISTGIT_DIR=$(pwd) cd .. #+end_src 3. Bump version in =version.mk= and =packacing/rpm/xdp-tools.spec= -- don't forget a changelog entry in the latter. Commit this, bug don't tag and push until the rest of the steps below completed successfully. 4. Run =make archive= to generate a source tarball (xdp-tools-$VERSION.tar.gz). #+begin_src sh cd $XDP_TOOLS_DIR git submodule init git submodule update make archive #+end_src 5. Copy source tarball to =~/rpmbuild/SOURCES=. #+begin_src sh mkdir -p ~/rpmbuild/SOURCES/ cp ./xdp-tools-$VERSION.tar.gz ~/rpmbuild/SOURCES/ #+end_src 6. Build local package using =rpmbuild -ba packaging/rpm/xdp-tools.spec=. 7. Check that building a scratch build on Fedora infrastructure works: =cd $FEDORA_DISTGIT_DIR && fedpkg scratch-build --srpm ~/rpmbuild/SRPMS/xdp-tools-$VERSION.fcXX.src.rpm= 7. 
Sync the xdp-tools.spec file to dist-git (but don't commit anything yet): - For both RHEL and Fedora, copy over the new version, then manually inspect the git diff and revert any changes that undo previous modifications in that distro. For Fedora, this is mainly changelog entries by rebuild bots, and for RHEL it's mainly the changelog, the =__brp_strip= defines and the symlinks to earlier .so versions for libxdp. - For RHEL also manually create a new symlink entry to the previous .so-version and add it to the file list. Make sure to be on the right branch in each dist-git. 8. Create an SRPM and scratch build for RHEL (in RHEL dist-git directory): #+begin_src sh cd $RHEL_DISTGIT_DIR cp ~/rpmbuild/SOURCES/xdp-tools-$VERSION.tar.gz . rhpkg srpm rhpkg scratch-build --srpm xdp-tools-$VERSION.el8.src.rpm #+end_src 9. Upload new sources files to both Fedora and RHEL - this will also update the 'sources' file in each directory, which is why we didn't commit anything earlier: #+begin_src sh cd $FEDORA_DISTGIT_DIR fedpkg new-sources ~/rpmbuild/SOURCES/xdp-tools-$VERSION.tar.gz git add xdp-tools.spec git commit cd $RHEL_DISTGIT_DIR rhpkg new-sources ~/rpmbuild/SOURCES/xdp-tools-$VERSION.tar.gz git add xdp-tools.spec git commit #+end_src For both, check the git history for commit message inspiration. In particular, to be accepted into the RHEL dist-git, the commit message must reference a valid Bugzilla ID. See the commit log for earlier commits for syntax for this. 10. Push the dist-git repositories and request builds for each: #+begin_src sh cd $FEDORA_DISTGIT_DIR git push fedpkg build cd $RHEL_DISTGIT_DIR git push rhpkg build #+end_src 11. Tag the commit in the xdp-tools repo and push the branch and tags to github. Tag syntax is =v$VERSION=, where =~betaX= becomes =-betaX= (git doesn't allow tildes in tag names). 12. Wait for the CI gating emails to tick in. Check any failures in the CI dashboard and waive and/or fix as necessary. 
Then talk to QE to have them run the =manual.sst_networking.xdp-tools.tier1= tests and mark it as completed; this will cause the build to be tagged rhel-$VERSION-candidate (from rhel-$VERSION-gate) and allow it to proceed. 13. Add the new build to the errata; this may entail moving the errata status back to =NEW_FILES=. After adding the new build, it should be moved to QE state; if this is not immediately possible, just resolve any issues blocking it. xdp-tools-1.6.1/packaging/rpm/xdp-tools.spec000066400000000000000000000134071514310632100210000ustar00rootroot00000000000000Name: xdp-tools Version: 1.6.1 Release: 1%{?dist} Summary: Utilities and example programs for use with XDP %global _soversion 1.6.0 License: GPL-2.0-only URL: https://github.com/xdp-project/%{name} Source0: https://github.com/xdp-project/%{name}/releases/download/v%{version}/xdp-tools-%{version}.tar.gz BuildRequires: kernel-headers BuildRequires: libbpf-devel BuildRequires: elfutils-libelf-devel BuildRequires: zlib-devel BuildRequires: libpcap-devel BuildRequires: libcap-ng-devel BuildRequires: clang >= 10.0.0 BuildRequires: llvm >= 10.0.0 BuildRequires: make BuildRequires: gcc BuildRequires: pkgconfig BuildRequires: m4 BuildRequires: emacs-nox BuildRequires: wireshark-cli BuildRequires: bpftool ExcludeArch: i386 i686 # Always keep xdp-tools and libxdp packages in sync Requires: libxdp = %{version}-%{release} %global _hardened_build 1 %description Utilities and example programs for use with XDP %package -n libxdp Summary: XDP helper library License: LGPL-2.1-only OR BSD-2-Clause %package -n libxdp-devel Summary: Development files for libxdp License: LGPL-2.1-only OR BSD-2-Clause Requires: kernel-headers Requires: libxdp = %{version}-%{release} %package -n libxdp-static Summary: Static library files for libxdp License: LGPL-2.1-only OR BSD-2-Clause Requires: libxdp-devel = %{version}-%{release} %description -n libxdp The libxdp package contains the libxdp library for managing XDP programs, used by 
the %{name} package %description -n libxdp-devel The libxdp-devel package contains headers used for building XDP programs using libxdp. %description -n libxdp-static The libxdp-static package contains the static library version of libxdp. %prep %autosetup -p1 -n %{name}-%{version} %build export CFLAGS='%{build_cflags}' export LDFLAGS='%{build_ldflags}' export LIBDIR='%{_libdir}' export RUNDIR='%{_rundir}' export CLANG=$(which clang) export PRODUCTION=1 export DYNAMIC_LIBXDP=1 export FORCE_SYSTEM_LIBBPF=1 export FORCE_EMACS=1 ./configure make %{?_smp_mflags} V=1 %install export DESTDIR='%{buildroot}' export SBINDIR='%{_sbindir}' export LIBDIR='%{_libdir}' export RUNDIR='%{_rundir}' export MANDIR='%{_mandir}' export DATADIR='%{_datadir}' export HDRDIR='%{_includedir}/xdp' make install V=1 %files %{_sbindir}/xdp-bench %{_sbindir}/xdp-filter %{_sbindir}/xdp-forward %{_sbindir}/xdp-loader %{_sbindir}/xdp-monitor %{_sbindir}/xdp-trafficgen %{_sbindir}/xdpdump %{_mandir}/man8/* %{_libdir}/bpf/xdpfilt_*.o %{_libdir}/bpf/xdpdump_*.o %{_datadir}/xdp-tools/ %license LICENSES/* %files -n libxdp %{_libdir}/libxdp.so.1 %{_libdir}/libxdp.so.%{_soversion} %{_libdir}/bpf/xdp-dispatcher.o %{_libdir}/bpf/xsk_def_xdp_prog*.o %{_mandir}/man3/* %license LICENSES/* %files -n libxdp-static %{_libdir}/libxdp.a %files -n libxdp-devel %{_includedir}/xdp/*.h %{_libdir}/libxdp.so %{_libdir}/pkgconfig/libxdp.pc %changelog * Wed Feb 11 2026 Toke Høiland-Jørgensen 1.6.1-1 - Upstream version bump * Tue Jan 6 2026 Toke Høiland-Jørgensen 1.6.0-1 - Upstream version bump * Sat Nov 29 2025 Toke Høiland-Jørgensen 1.5.8-1 - Upstream version bump * Fri Oct 3 2025 Toke Høiland-Jørgensen 1.5.7-1 - Upstream version bump * Fri Aug 15 2025 Toke Høiland-Jørgensen 1.5.6-1 - Upstream version bump * Mon May 26 2025 Toke Høiland-Jørgensen 1.5.5-1 - Upstream version bump * Mon Apr 28 2025 Toke Høiland-Jørgensen 1.5.4-1 - Upstream version bump * Fri Mar 7 2025 Toke Høiland-Jørgensen 1.5.3-1 - Upstream version bump * 
Wed Feb 19 2025 Toke Høiland-Jørgensen 1.5.2-1 - Upstream version bump * Tue Jan 14 2025 Toke Høiland-Jørgensen 1.5.1-1 - Upstream version bump * Thu Jan 9 2025 Toke Høiland-Jørgensen 1.5.0-1 - Upstream version bump * Tue Aug 6 2024 Toke Høiland-Jørgensen 1.4.3-1 - Upstream version bump * Tue Jan 30 2024 Toke Høiland-Jørgensen 1.4.2-1 - Upstream version bump * Fri Oct 20 2023 Toke Høiland-Jørgensen 1.4.1-1 - Upstream version bump * Thu Jul 6 2023 Toke Høiland-Jørgensen 1.4.0-1 - Upstream version bump * Thu Feb 23 2023 Toke Høiland-Jørgensen 1.3.1-1 - Upstream version bump * Tue Feb 7 2023 Toke Høiland-Jørgensen 1.3.0-1 - Upstream version bump * Thu Jan 20 2022 Toke Høiland-Jørgensen 1.2.2-1 - Upstream version bump * Thu Jan 13 2022 Toke Høiland-Jørgensen 1.2.1-1 - Upstream version bump * Wed Jul 7 2021 Toke Høiland-Jørgensen 1.2.0-1 - Upstream version bump * Wed Feb 3 2021 Toke Høiland-Jørgensen 1.1.1-1 - Upstream version bump * Mon Jan 4 2021 Toke Høiland-Jørgensen 1.1.0-1 - Upstream version bump * Thu Aug 20 2020 Toke Høiland-Jørgensen 1.0.1-1 - Upstream version bump * Tue Aug 18 2020 Toke Høiland-Jørgensen 1.0.0-1 - Upstream version bump * Wed Jul 15 2020 Eelco Chaudron 1.0.0~beta3-0.1 - Upstream version bump * Fri Jul 10 2020 Toke Høiland-Jørgensen 1.0.0~beta2-0.1 - Upstream version bump * Mon Jun 15 2020 Toke Høiland-Jørgensen 1.0.0~beta1-0.1 - Upstream version bump * Mon Apr 6 2020 Toke Høiland-Jørgensen 0.0.3-1 - Upstream update, add libxdp sub-packages * Thu Nov 21 2019 Toke Høiland-Jørgensen 0.0.2-1 - Upstream update * Fri Nov 8 2019 Toke Høiland-Jørgensen 0.0.1-1 - Initial release xdp-tools-1.6.1/version.mk000066400000000000000000000003451514310632100154640ustar00rootroot00000000000000TOOLS_VERSION := "1.6.1" # Conditionally defined make target makes it possible to print the version # defined above by running 'make -f version.mk' ifeq ($(MAKEFILE_LIST),version.mk) print_version: @echo $(TOOLS_VERSION) endif 
xdp-tools-1.6.1/xdp-bench/000077500000000000000000000000001514310632100153145ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-bench/.gitignore000066400000000000000000000000121514310632100172750ustar00rootroot00000000000000xdp-bench xdp-tools-1.6.1/xdp-bench/Makefile000066400000000000000000000011561514310632100167570ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) XDP_TARGETS := xdp_redirect_basic.bpf xdp_redirect_cpumap.bpf xdp_redirect_devmap.bpf \ xdp_redirect_devmap_multi.bpf xdp_basic.bpf BPF_SKEL_TARGETS := $(XDP_TARGETS) # Don't install skeleton object files XDP_OBJ_INSTALL := TOOL_NAME := xdp-bench MAN_PAGE := xdp-bench.8 TEST_FILE := tests/test-xdp-bench.sh USER_TARGETS := xdp-bench USER_EXTRA_C := xdp_redirect_basic.c xdp_redirect_cpumap.c xdp_redirect_devmap.c \ xdp_redirect_devmap_multi.c xdp_basic.c xdp_socket.c EXTRA_USER_DEPS := xdp-bench.h LIB_DIR = ../lib include $(LIB_DIR)/common.mk xdp-tools-1.6.1/xdp-bench/README.org000066400000000000000000000632521514310632100167720ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-bench #+TITLE: xdp-bench #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"A simple XDP benchmarking tool" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * XDP-bench - a simple XDP benchmarking tool XDP-bench is a benchmarking utility for exercising the different operation modes of XDP. It is intended to be a simple program demonstrating the various operating modes; these include dropping packets, hairpin forwarding (using the =XDP_TX= return code), and redirection using the various in-kernel packet redirection facilities. 
The drop and TX modes support various options to control whether packet data is touched (read or written) before being dropped or transmitted. The redirection modes support using the simple ifindex-based =bpf_redirect= helper, the =bpf_redirect_map= helper using a cpumap as its target, =bpf_redirect_map= using a devmap as its target, and the devmap's broadcast mode which allows redirecting to multiple devices. There is more information on the meaning of the output in both default (terse) and extended output mode, in the *Output Format Description* section below. ** Running xdp-bench The syntax for running xdp-bench is: #+begin_src sh Usage: xdp-bench COMMAND [options] COMMAND can be one of: drop - Drop all packets on an interface pass - Pass all packets to the network stack tx - Transmit packets back out on an interface (hairpin forwarding) redirect - XDP redirect using the bpf_redirect() helper redirect-cpu - XDP CPU redirect using BPF_MAP_TYPE_CPUMAP redirect-map - XDP redirect using BPF_MAP_TYPE_DEVMAP redirect-multi - XDP multi-redirect using BPF_MAP_TYPE_DEVMAP and the BPF_F_BROADCAST flag #+end_src Each command, and its options are explained below. Or use =xdp-bench COMMAND --help= to see the options for each command. * The DROP command In this mode, =xdp-bench= installs an XDP program on an interface that simply drops all packets. There are options to control what to do with the packet before dropping it (touch the packet data or not), as well as which statistics to gather. This is a basic benchmark for the baseline (best-case) performance of XDP on an interface. The syntax for the =drop= command is: =xdp-bench drop [options] = Where == is the name of the interface the XDP program should be installed on. The supported options are: ** -p, --packet-operation Specify which operation should be taken on the packet before dropping it. 
The following actions are available: #+begin_src sh no-touch - Drop the packet without touching the packet data read-data - Read a field in the packet header before dropping parse-ip - Parse the IP header field before dropping swap-macs - Swap the source and destination MAC addresses before dropping #+end_src Whether to touch the packet before dropping it can have a significant performance impact as this requires bringing packet data into the CPU cache (and flushing it back out if writing). The default for this option is =no-touch=. ** -l, --load-mode Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: #+begin_src sh dpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions #+end_src This can be used to benchmark the various packet access modes supported by the kernel. The default for this option is =dpa=. ** -r, --rxq-stats If set, the XDP program will also gather statistics on which receive queue index each packet was received on. This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. 
Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The PASS command In this mode, =xdp-bench= installs an XDP program on an interface that passes all packets to the network stack after processing them (returning =XDP_PASS=). There are options to control what to do with the packet before passing it (touch the packet data or not), as well as which statistics to gather. This is a basic benchmark for the overhead of installing an XDP program on an interface while still running the regular network stack. The syntax for the =pass= command is: =xdp-bench pass [options] = Where == is the name of the interface the XDP program should be installed on. The supported options are: ** -p, --packet-operation Specify which operation should be taken on the packet before passing it. The following actions are available: #+begin_src sh no-touch - Pass the packet without touching the packet data read-data - Read a field in the packet header before passing parse-ip - Parse the IP header field before passing swap-macs - Swap the source and destination MAC addresses before passing #+end_src The default for this option is =no-touch=. ** -l, --load-mode Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: #+begin_src sh dpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions #+end_src This can be used to benchmark the various packet access modes supported by the kernel. The default for this option is =dpa=. ** -r, --rxq-stats If set, the XDP program will also gather statistics on which receive queue index each packet was received on. This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). 
** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The TX command In this mode, =xdp-bench= installs an XDP program on an interface that performs so-called "hairpin forwarding", which means each packet is transmitted back out the same interface (using the =XDP_TX= return code).. There are options to control what to do with the packet before transmitting it (touch the packet data or not), as well as which statistics to gather. The syntax for the =tx= command is: =xdp-bench tx [options] = Where == is the name of the interface the XDP program should be installed on. The supported options are: ** -p, --packet-operation Specify which operation should be taken on the packet before transmitting it. 
The following actions are available: #+begin_src sh no-touch - Transmit the packet without touching the packet data read-data - Read a field in the packet header before transmitting parse-ip - Parse the IP header field before transmitting swap-macs - Swap the source and destination MAC addresses before transmitting #+end_src To allow the packet to be successfully transmitted back to the sender, the MAC addresses have to be swapped, so that the source MAC matches the network device. However, there is a performance overhead in doing swapping, so this option allows this function to be turned off. The default for this option is =swap-macs=. ** -l, --load-mode Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: #+begin_src sh dpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions #+end_src This can be used to benchmark the various packet access modes supported by the kernel. The default for this option is =dpa=. ** -r, --rxq-stats If set, the XDP program will also gather statistics on which receive queue index each packet was received on. This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. 
However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The REDIRECT command In this mode, =xdp-bench= sets up packet redirection between the two interfaces supplied on the command line using the =bpf_redirect= BPF helper triggered on packet reception on the ingress interface. The syntax for the =redirect= command is: =xdp-bench redirect [options] = Where == is the name of the input interface from where packets will be redirect to the output interface ==. The supported options are: ** -l, --load-mode Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: #+begin_src sh dpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions #+end_src This can be used to benchmark the various packet access modes supported by the kernel. The default for this option is =dpa=. ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -s, --stats Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. 
However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The REDIRECT-CPU command In this mode, =xdp-bench= sets up packet redirection using the =bpf_redirect_map= BPF helper triggered on packet reception on the ingress interface, using a cpumap as its target. Hence, this tool can be used to redirect packets on an interface from one CPU to another. In addition to this, the tool then supports redirecting the packet to another output device when it is processed on the target CPU. The syntax for the =redirect-cpu= command is: =xdp-bench redirect-cpu [options] -c 0 ... -c N= Where == is the name of the input interface from where packets will be redirect to the target CPU list specified using =-c=. The supported options are: ** -c, --cpu Specify a possible target CPU index. This option must be passed at least once, and can be passed multiple times to specify a list of CPUs. Which CPU is chosen for a given packet depends on the value of the =--program-mode= option, described below. ** -p, --program-mode Specify a program that embeds a predefined policy deciding how packets are redirected to different CPUs. 
The following options are available: #+begin_src sh no-touch - Redirect without touching packet data touch - Read packet data before redirecting round-robin - Cycle between target CPUs in a round-robin fashion (for each packet) l4-proto - Choose the target CPU based on the layer-4 protocol of packet l4-filter - Like l4-proto, but drop UDP packets with destination port 9 (used by pktgen) l4-hash - Use source and destination IP hashing to pick target CPU l4-sport - Use modulo of source port to pick target CPU l4-dport - Use modulo of destination port to pick target CPU #+end_src The =no-touch= and =touch= modes always redirect packets to the same CPU (the first value supplied to =--cpu=). The =round-robin= and =l4-hash= modes distribute packets between all the CPUs supplied as =--cpu= arguments, while =l4-proto= and =l4-filter= send TCP and unrecognised packets to CPU index 0, UDP packets to CPU index 1 and ICMP packets to CPU index 2 (where the index refers to the order the actual CPUs are given on the command line). The default for this option is =l4-hash=. ** -r --remote-action If this option is set, a separate program is installed into the cpumap, which will be invoked on the remote CPU after the packet is processed there. The action can be either =drop= or =pass= which will drop the packet or pass it to the regular networking stack, respectively. Or it can be =redirect=, which will cause the packet to be redirected to another interface and transmitted out that interface on the remote CPU. If this option is set to =redirect= the target device must be specified using =--redirect-device=. The default for this option is =disabled=. ** -r, --redirect-device Specify the device to redirect the packet to when it is received on the target CPU. Note that this option can only be specified with =--remote-action redirect=. ** -q, --qsize Set the queue size for the per-CPU cpumap ring buffer used for redirecting packets from multiple CPUs to one CPU. 
The default value is 2048 packets. ** -x, --stress-mode Stress the cpumap implementation by deallocating and reallocating the cpumap ring buffer on each polling interval. ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -s, --stats Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The REDIRECT-MAP command In this mode, =xdp-bench= sets up packet redirection between two interfaces supplied on the command line using the =bpf_redirect_map()= BPF helper triggered on packet reception on the ingress interface, using a devmap as its target. The syntax for the =redirect-map= command is: =xdp-bench redirect-map [options] = Where == is the name of the input interface from where packets will be redirect to the output interface ==. The supported options are: ** -X, --load-egress Load a program in the devmap entry used for redirection, so that it is invoked after the packet is redirected to the target device, before it is transmitted out of the output interface. 
The program can be selected via the =egress-mode= option. ** -A, --egress-mode Set egress program to load: #+begin_src sh forward - Update the packet data so its source MAC address matches the one of the destination interface. drop - Drop packet. #+end_src The default for this option is =forward=. ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -s, --stats Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * The REDIRECT-MULTI command In this mode, =xdp-bench= sets up one-to-many packet redirection between interfaces supplied on the command line, using the =bpf_redirect_map= BPF helper triggered on packet reception on the ingress interface, using a devmap as its target. The packet is broadcast to all output interfaces specified on the command line, using devmap's packet broadcast feature. The syntax for the =redirect-multi= command is: =xdp-bench redirect-multi [options] ... = Where == is the name of the input interface from where packets will be redirect to one or many output interface(s). 
The supported options are: ** -X, --load-egress Load a program in the devmap entry used for redirection, so that it is invoked after the packet is redirected to the target device, before it is transmitted out of the output interface. The program can be selected via the =egress-mode= option. ** -A, --egress-mode Set egress program to load: #+begin_src sh forward - Update the packet data so its source MAC address matches the one of the destination interface. drop - Drop packet. #+end_src The default for this option is =forward=. ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -s, --stats Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. ** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -m, --mode Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * Output Format Description By default, redirect success statistics are disabled, use =--stats= to enable. The terse output mode is default, extended output mode can be activated using the =--extended= command line option. SIGQUIT (Ctrl + \\) can be used to switch the mode dynamically at runtime. 
Terse mode displays at most the following fields: #+begin_src sh rx/s Number of packets received per second redir/s Number of packets successfully redirected per second err,drop/s Aggregated count of errors per second (including dropped packets when not using the drop command) xmit/s Number of packets transmitted on the output device per second #+end_src Extended output mode displays at most the following fields: #+begin_src sh FIELD DESCRIPTION receive Displays the number of packets received and errors encountered Whenever an error or packet drop occurs, details of per CPU error and drop statistics will be expanded inline in terse mode. pkt/s - Packets received per second drop/s - Packets dropped per second error/s - Errors encountered per second redirect - Displays the number of packets successfully redirected Errors encountered are expanded under redirect_err field Note that passing -s to enable it has a per packet overhead redir/s - Packets redirected successfully per second redirect_err Displays the number of packets that failed redirection The errno is expanded under this field with per CPU count The recognized errors are: EINVAL: Invalid redirection ENETDOWN: Device being redirected to is down EMSGSIZE: Packet length too large for device EOPNOTSUPP: Operation not supported ENOSPC: No space in ptr_ring of cpumap kthread error/s - Packets that failed redirection per second enqueue to cpu N Displays the number of packets enqueued to bulk queue of CPU N Expands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N Received packets can be associated with the CPU redirect program is enqueuing packets to. 
pkt/s - Packets enqueued per second from other CPU to CPU N drop/s - Packets dropped when trying to enqueue to CPU N bulk-avg - Average number of packets processed for each event kthread Displays the number of packets processed in CPUMAP kthread for each CPU Packets consumed from ptr_ring in kthread, and its xdp_stats (after calling CPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and then per-CPU to associate it to each CPU's pinned CPUMAP kthread. pkt/s - Packets consumed per second from ptr_ring drop/s - Packets dropped per second in kthread sched - Number of times kthread called schedule() xdp_stats (also expands to per-CPU counts) pass/s - XDP_PASS count for CPUMAP program execution drop/s - XDP_DROP count for CPUMAP program execution redir/s - XDP_REDIRECT count for CPUMAP program execution xdp_exception Displays xdp_exception tracepoint events This can occur due to internal driver errors, unrecognized XDP actions and due to explicit user trigger by use of XDP_ABORTED Each action is expanded below this field with its count hit/s - Number of times the tracepoint was hit per second devmap_xmit Displays devmap_xmit tracepoint events This tracepoint is invoked for successful transmissions on output device but these statistics are not available for generic XDP mode, hence they will be omitted from the output when using SKB mode xmit/s - Number of packets that were transmitted per second drop/s - Number of packets that failed transmissions per second drv_err/s - Number of internal driver errors per second bulk-avg - Average number of packets processed for each event #+end_src * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR Earlier xdp-redirect tools were written by Jesper Dangaard Brouer and John Fastabend. They were then rewritten to support more features by Kumar Kartikeya Dwivedi, who also ported them to xdp-tools together with Toke Høiland-Jørgensen. 
This man page was written by Kumar Kartikeya Dwivedi and Toke Høiland-Jørgensen. xdp-tools-1.6.1/xdp-bench/hash_func01.h000066400000000000000000000024501514310632100175650ustar00rootroot00000000000000/* SPDX-License-Identifier: LGPL-2.1 * * Based on Paul Hsieh's (LGPG 2.1) hash function * From: http://www.azillionmonkeys.com/qed/hash.html */ #define get16bits(d) (*((const __u16 *) (d))) static __always_inline __u32 SuperFastHash(const char *data, int len, __u32 initval) { __u32 hash = initval; __u32 tmp; int rem; if (len <= 0 || data == NULL) return 0; rem = len & 3; len >>= 2; /* Main loop */ #pragma clang loop unroll(full) for (;len > 0; len--) { hash += get16bits (data); tmp = (get16bits (data+2) << 11) ^ hash; hash = (hash << 16) ^ tmp; data += 2*sizeof (__u16); hash += hash >> 11; } /* Handle end cases */ switch (rem) { case 3: hash += get16bits (data); hash ^= hash << 16; hash ^= ((signed char)data[sizeof (__u16)]) << 18; hash += hash >> 11; break; case 2: hash += get16bits (data); hash ^= hash << 11; hash += hash >> 17; break; case 1: hash += (signed char)*data; hash ^= hash << 10; hash += hash >> 1; } /* Force "avalanching" of final 127 bits */ hash ^= hash << 3; hash += hash >> 5; hash ^= hash << 4; hash += hash >> 17; hash ^= hash << 25; hash += hash >> 6; return hash; } xdp-tools-1.6.1/xdp-bench/tests/000077500000000000000000000000001514310632100164565ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-bench/tests/test-xdp-bench.sh000066400000000000000000000144701514310632100216450ustar00rootroot00000000000000XDP_LOADER=${XDP_LOADER:-./xdp-loader} XDP_BENCH=${XDP_BENCH:-./xdp-bench} ALL_TESTS="test_drop test_pass test_tx test_xdp_load_bytes test_rxq_stats test_redirect test_redirect_cpu test_redirect_map test_redirect_map_egress test_redirect_multi test_redirect_multi_egress test_xsk_drop test_xsk_tx" test_basic() { action=$1 export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run $XDP_BENCH $action $NS -vv check_run $XDP_BENCH $action $NS -p read-data -vv check_run 
$XDP_BENCH $action $NS -p parse-ip -vv check_run $XDP_BENCH $action $NS -p swap-macs -vv check_run $XDP_BENCH $action $NS -m skb -vv check_run $XDP_BENCH $action $NS -e -vv } test_drop() { test_basic drop } test_pass() { test_basic pass } test_tx() { test_basic tx } test_xdp_load_bytes() { skip_if_missing_xdp_load_bytes export XDP_SAMPLE_IMMEDIATE_EXIT=1 for action in drop pass tx; do check_run $XDP_BENCH $action $NS -l load-bytes -vv check_run $XDP_BENCH $action $NS -p read-data -l load-bytes -vv check_run $XDP_BENCH $action $NS -p parse-ip -l load-bytes -vv check_run $XDP_BENCH $action $NS -p swap-macs -l load-bytes -vv check_run $XDP_BENCH $action $NS -m skb -l load-bytes -vv check_run $XDP_BENCH $action $NS -e -l load-bytes -vv done check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_BENCH redirect btest0 btest1 -l load-bytes -vv check_run $XDP_BENCH redirect btest0 btest1 -s -l load-bytes -vv check_run $XDP_BENCH redirect btest0 btest1 -m skb -l load-bytes -vv check_run $XDP_BENCH redirect btest0 btest1 -e -l load-bytes -vv ip link del dev btest0 } test_rxq_stats() { skip_if_missing_veth_rxq export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run $XDP_BENCH drop $NS -r -vv } test_redirect() { export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_BENCH redirect btest0 btest1 -vv check_run $XDP_BENCH redirect btest0 btest1 -s -vv check_run $XDP_BENCH redirect btest0 btest1 -m skb -vv check_run $XDP_BENCH redirect btest0 btest1 -e -vv ip link del dev btest0 } test_redirect_cpu() { skip_if_missing_cpumap_attach export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_BENCH redirect-cpu btest0 -c 0 -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -m skb -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -p touch -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -p round-robin -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -p l4-proto -vv 
check_run $XDP_BENCH redirect-cpu btest0 -c 0 -p l4-filter -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -p l4-hash -vv if is_progmap_supported; then check_run $XDP_BENCH redirect-cpu btest0 -c 0 -r drop -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -r pass -vv check_run $XDP_BENCH redirect-cpu btest0 -c 0 -r redirect -D btest1 -vv fi ip link del dev btest0 } test_redirect_map() { export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_BENCH redirect-map btest0 btest1 -vv check_run $XDP_BENCH redirect-map btest0 btest1 -s -vv check_run $XDP_BENCH redirect-map btest0 btest1 -m skb -vv check_run $XDP_BENCH redirect-map btest0 btest1 -e -vv ip link del dev btest0 } test_redirect_map_egress() { skip_if_missing_cpumap_attach export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 if is_progmap_supported; then check_run $XDP_BENCH redirect-map btest0 btest1 -X -vv check_run $XDP_BENCH redirect-map btest0 btest1 -X -A forward -vv check_run $XDP_BENCH redirect-map btest0 btest1 -X -A drop -vv fi ip link del dev btest0 } test_redirect_multi() { export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run ip link add dev btest2 type veth peer name btest3 check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -vv check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -s -vv check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -m skb -vv check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -e -vv ip link del dev btest0 ip link del dev btest2 } test_redirect_multi_egress() { skip_if_missing_cpumap_attach export XDP_SAMPLE_IMMEDIATE_EXIT=1 is_progmap_supported || export LIBXDP_SKIP_DISPATCHER=1 check_run ip link add dev btest0 type veth peer name btest1 check_run ip link add dev btest2 type veth peer name btest3 check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -X -vv check_run 
$XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -X -A forward -vv check_run $XDP_BENCH redirect-multi btest0 btest1 btest2 btest3 -X -A drop -vv ip link del dev btest0 ip link del dev btest2 } test_xsk_one() { action=$1 shift export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_BENCH $action btest0 "$@" ip link del dev btest0 } test_xsk() { local action local res local hugepg action=$1 test_xsk_one $action is_xsk_busy_poll_supported && test_xsk_one $action -B test_xsk_one $action -C copy test_xsk_one $action -F test_xsk_one $action -M test_xsk_one $action -Q test_xsk_one $action -W SCHED_FIFO -U 50 test_xsk_one $action -b 32 test_xsk_one $action -d 1 test_xsk_one $action -f 2048 test_xsk_one $action -m test_xsk_one $action -p test_xsk_one $action -q 0 hugepg=$(cat /proc/sys/vm/nr_hugepages) if [ "$hugepg" -lt "8" ]; then echo 8 > /proc/sys/vm/nr_hugepages res=$? else res=0 fi if [ "$res" = "0" ]; then test_xsk_one $action -u echo $hugepg > /proc/sys/vm/nr_hugepages fi test_xsk_one $action -w BOOTTIME test_xsk_one $action -w MONOTONIC test_xsk_one $action -x -a } test_xsk_drop() { test_xsk xsk-drop } test_xsk_tx() { test_xsk xsk-tx } cleanup_tests() { ip link del dev btest0 >/dev/null 2>&1 ip link del dev btest2 >/dev/null 2>&1 $XDP_LOADER unload $NS --all >/dev/null 2>&1 $XDP_LOADER clean >/dev/null 2>&1 } xdp-tools-1.6.1/xdp-bench/xdp-bench.8000066400000000000000000000665011514310632100172650ustar00rootroot00000000000000.TH "xdp-bench" "8" "DECEMBER 15, 2025" "V1.6.1" "A simple XDP benchmarking tool" .SH "NAME" XDP-bench \- a simple XDP benchmarking tool .SH "SYNOPSIS" .PP XDP-bench is a benchmarking utility for exercising the different operation modes of XDP. It is intended to be a simple program demonstrating the various operating modes; these include dropping packets, hairpin forwarding (using the \fIXDP_TX\fP return code), and redirection using the various in-kernel packet redirection facilities. 
.PP The drop and TX modes support various options to control whether packet data is touched (read or written) before being dropped or transmitted. The redirection modes support using the simple ifindex-based \fIbpf_redirect\fP helper, the \fIbpf_redirect_map\fP helper using a cpumap as its target, \fIbpf_redirect_map\fP using a devmap as its target, and the devmap's broadcast mode which allows redirecting to multiple devices. .PP There is more information on the meaning of the output in both default (terse) and extended output mode, in the \fBOutput Format Description\fP section below. .SS "Running xdp-bench" .PP The syntax for running xdp-bench is: .RS .nf \fCUsage: xdp-bench COMMAND [options] COMMAND can be one of: drop - Drop all packets on an interface pass - Pass all packets to the network stack tx - Transmit packets back out on an interface (hairpin forwarding) redirect - XDP redirect using the bpf_redirect() helper redirect-cpu - XDP CPU redirect using BPF_MAP_TYPE_CPUMAP redirect-map - XDP redirect using BPF_MAP_TYPE_DEVMAP redirect-multi - XDP multi-redirect using BPF_MAP_TYPE_DEVMAP and the BPF_F_BROADCAST flag \fP .fi .RE .PP Each command, and its options are explained below. Or use \fIxdp\-bench COMMAND \-\-help\fP to see the options for each command. .SH "The DROP command" .PP In this mode, \fIxdp\-bench\fP installs an XDP program on an interface that simply drops all packets. There are options to control what to do with the packet before dropping it (touch the packet data or not), as well as which statistics to gather. This is a basic benchmark for the baseline (best-case) performance of XDP on an interface. .PP The syntax for the \fIdrop\fP command is: .PP \fIxdp\-bench drop [options] \fP .PP Where \fI\fP is the name of the interface the XDP program should be installed on. .PP The supported options are: .SS "-p, --packet-operation " .PP Specify which operation should be taken on the packet before dropping it. 
The following actions are available: .RS .nf \fCno-touch - Drop the packet without touching the packet data read-data - Read a field in the packet header before dropping parse-ip - Parse the IP header field before dropping swap-macs - Swap the source and destination MAC addresses before dropping \fP .fi .RE .PP Whether to touch the packet before dropping it can have a significant performance impact as this requires bringing packet data into the CPU cache (and flushing it back out if writing). .PP The default for this option is \fIno\-touch\fP. .SS "-l, --load-mode " .PP Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: .RS .nf \fCdpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions \fP .fi .RE .PP This can be used to benchmark the various packet access modes supported by the kernel. .PP The default for this option is \fIdpa\fP. .SS "-r, --rxq-stats" .PP If set, the XDP program will also gather statistics on which receive queue index each packet was received on. This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. 
.SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The PASS command" .PP In this mode, \fIxdp\-bench\fP installs an XDP program on an interface that passes all packets to the network stack after processing them (returning \fIXDP_PASS\fP). There are options to control what to do with the packet before passing it (touch the packet data or not), as well as which statistics to gather. This is a basic benchmark for the overhead of installing an XDP program on an interface while still running the regular network stack. .PP The syntax for the \fIpass\fP command is: .PP \fIxdp\-bench pass [options] \fP .PP Where \fI\fP is the name of the interface the XDP program should be installed on. .PP The supported options are: .SS "-p, --packet-operation " .PP Specify which operation should be taken on the packet before passing it. The following actions are available: .RS .nf \fCno-touch - Pass the packet without touching the packet data read-data - Read a field in the packet header before passing parse-ip - Parse the IP header field before passing swap-macs - Swap the source and destination MAC addresses before passing \fP .fi .RE .PP The default for this option is \fIno\-touch\fP. .SS "-l, --load-mode " .PP Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: .RS .nf \fCdpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions \fP .fi .RE .PP This can be used to benchmark the various packet access modes supported by the kernel. .PP The default for this option is \fIdpa\fP. .SS "-r, --rxq-stats" .PP If set, the XDP program will also gather statistics on which receive queue index each packet was received on. 
This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The TX command" .PP In this mode, \fIxdp\-bench\fP installs an XDP program on an interface that performs so-called "hairpin forwarding", which means each packet is transmitted back out the same interface (using the \fIXDP_TX\fP return code).. There are options to control what to do with the packet before transmitting it (touch the packet data or not), as well as which statistics to gather. .PP The syntax for the \fItx\fP command is: .PP \fIxdp\-bench tx [options] \fP .PP Where \fI\fP is the name of the interface the XDP program should be installed on. .PP The supported options are: .SS "-p, --packet-operation " .PP Specify which operation should be taken on the packet before transmitting it. 
The following actions are available: .RS .nf \fCno-touch - Transmit the packet without touching the packet data read-data - Read a field in the packet header before transmitting parse-ip - Parse the IP header field before transmitting swap-macs - Swap the source and destination MAC addresses before transmitting \fP .fi .RE .PP To allow the packet to be successfully transmitted back to the sender, the MAC addresses have to be swapped, so that the source MAC matches the network device. However, there is a performance overhead in doing swapping, so this option allows this function to be turned off. .PP The default for this option is \fIswap\-macs\fP. .SS "-l, --load-mode " .PP Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: .RS .nf \fCdpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions \fP .fi .RE .PP This can be used to benchmark the various packet access modes supported by the kernel. .PP The default for this option is \fIdpa\fP. .SS "-r, --rxq-stats" .PP If set, the XDP program will also gather statistics on which receive queue index each packet was received on. This is displayed in the extended output mode along with per-CPU data (which, depending on the hardware configuration may or may not be equivalent). .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. 
However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The REDIRECT command" .PP In this mode, \fIxdp\-bench\fP sets up packet redirection between the two interfaces supplied on the command line using the \fIbpf_redirect\fP BPF helper triggered on packet reception on the ingress interface. .PP The syntax for the \fIredirect\fP command is: .PP \fIxdp\-bench redirect [options] \fP .PP Where \fI\fP is the name of the input interface from where packets will be redirect to the output interface \fI\fP. .PP The supported options are: .SS "-l, --load-mode " .PP Specify which mechanism xdp-bench should use to load (and store) the packet data. The following modes are available: .RS .nf \fCdpa - Use traditional Direct Packet Access from the XDP program load-bytes - Use the xdp_load_bytes() and xdp_store_bytes() helper functions \fP .fi .RE .PP This can be used to benchmark the various packet access modes supported by the kernel. .PP The default for this option is \fIdpa\fP. .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-s, --stats" .PP Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). 
Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The REDIRECT-CPU command" .PP In this mode, \fIxdp\-bench\fP sets up packet redirection using the \fIbpf_redirect_map\fP BPF helper triggered on packet reception on the ingress interface, using a cpumap as its target. Hence, this tool can be used to redirect packets on an interface from one CPU to another. In addition to this, the tool then supports redirecting the packet to another output device when it is processed on the target CPU. .PP The syntax for the \fIredirect\-cpu\fP command is: .PP \fIxdp\-bench redirect\-cpu [options] \-c 0 ... \-c N\fP .PP Where \fI\fP is the name of the input interface from where packets will be redirect to the target CPU list specified using \fI\-c\fP. .PP The supported options are: .SS "-c, --cpu " .PP Specify a possible target CPU index. This option must be passed at least once, and can be passed multiple times to specify a list of CPUs. Which CPU is chosen for a given packet depends on the value of the \fI\-\-program\-mode\fP option, described below. .SS "-p, --program-mode " .PP Specify a program that embeds a predefined policy deciding how packets are redirected to different CPUs. 
The following options are available: .RS .nf \fCno-touch - Redirect without touching packet data touch - Read packet data before redirecting round-robin - Cycle between target CPUs in a round-robin fashion (for each packet) l4-proto - Choose the target CPU based on the layer-4 protocol of packet l4-filter - Like l4-proto, but drop UDP packets with destination port 9 (used by pktgen) l4-hash - Use source and destination IP hashing to pick target CPU l4-sport - Use modulo of source port to pick target CPU l4-dport - Use modulo of destination port to pick target CPU \fP .fi .RE .PP The \fIno\-touch\fP and \fItouch\fP modes always redirect packets to the same CPU (the first value supplied to \fI\-\-cpu\fP). The \fIround\-robin\fP and \fIl4\-hash\fP modes distribute packets between all the CPUs supplied as \fI\-\-cpu\fP arguments, while \fIl4\-proto\fP and \fIl4\-filter\fP send TCP and unrecognised packets to CPU index 0, UDP packets to CPU index 1 and ICMP packets to CPU index 2 (where the index refers to the order the actual CPUs are given on the command line). .PP The default for this option is \fIl4\-hash\fP. .SS "-r --remote-action " .PP If this option is set, a separate program is installed into the cpumap, which will be invoked on the remote CPU after the packet is processed there. The action can be either \fIdrop\fP or \fIpass\fP which will drop the packet or pass it to the regular networking stack, respectively. Or it can be \fIredirect\fP, which will cause the packet to be redirected to another interface and transmitted out that interface on the remote CPU. If this option is set to \fIredirect\fP the target device must be specified using \fI\-\-redirect\-device\fP. .PP The default for this option is \fIdisabled\fP. .SS "-r, --redirect-device " .PP Specify the device to redirect the packet to when it is received on the target CPU. Note that this option can only be specified with \fI\-\-remote\-action redirect\fP. 
.SS "-q, --qsize " .PP Set the queue size for the per-CPU cpumap ring buffer used for redirecting packets from multiple CPUs to one CPU. The default value is 2048 packets. .SS "-x, --stress-mode" .PP Stress the cpumap implementation by deallocating and reallocating the cpumap ring buffer on each polling interval. .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-s, --stats" .PP Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The REDIRECT-MAP command" .PP In this mode, \fIxdp\-bench\fP sets up packet redirection between two interfaces supplied on the command line using the \fIbpf_redirect_map()\fP BPF helper triggered on packet reception on the ingress interface, using a devmap as its target. .PP The syntax for the \fIredirect\-map\fP command is: .PP \fIxdp\-bench redirect\-map [options] \fP .PP Where \fI\fP is the name of the input interface from where packets will be redirect to the output interface \fI\fP. 
.PP The supported options are: .SS "-X, --load-egress" .PP Load a program in the devmap entry used for redirection, so that it is invoked after the packet is redirected to the target device, before it is transmitted out of the output interface. The program can be selected via the \fIegress\-mode\fP option. .SS "-A, --egress-mode " .PP Set egress program to load: .RS .nf \fCforward - Update the packet data so its source MAC address matches the one of the destination interface. drop - Drop packet. \fP .fi .RE .PP The default for this option is \fIforward\fP. .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-s, --stats" .PP Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. 
.SS "-h, --help" .PP Display a summary of the available options .SH "The REDIRECT-MULTI command" .PP In this mode, \fIxdp\-bench\fP sets up one-to-many packet redirection between interfaces supplied on the command line, using the \fIbpf_redirect_map\fP BPF helper triggered on packet reception on the ingress interface, using a devmap as its target. The packet is broadcast to all output interfaces specified on the command line, using devmap's packet broadcast feature. .PP The syntax for the \fIredirect\-multi\fP command is: .PP \fIxdp\-bench redirect\-multi [options] ... \fP .PP Where \fI\fP is the name of the input interface from where packets will be redirect to one or many output interface(s). .PP The supported options are: .SS "-X, --load-egress" .PP Load a program in the devmap entry used for redirection, so that it is invoked after the packet is redirected to the target device, before it is transmitted out of the output interface. The program can be selected via the \fIegress\-mode\fP option. .SS "-A, --egress-mode " .PP Set egress program to load: .RS .nf \fCforward - Update the packet data so its source MAC address matches the one of the destination interface. drop - Drop packet. \fP .fi .RE .PP The default for this option is \fIforward\fP. .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-s, --stats" .PP Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-m, --mode" .PP Selects the XDP program mode (native or skb). 
Note that native XDP mode is the default, and loading the redirect program in skb manner is neither performant, nor recommended. However, this option is useful if the interface driver lacks native XDP support, or when simply testing the tool. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "Output Format Description" .PP By default, redirect success statistics are disabled, use \fI\-\-stats\fP to enable. The terse output mode is default, extended output mode can be activated using the \fI\-\-extended\fP command line option. .PP SIGQUIT (Ctrl + \\) can be used to switch the mode dynamically at runtime. .PP Terse mode displays at most the following fields: .RS .nf \fCrx/s Number of packets received per second redir/s Number of packets successfully redirected per second err,drop/s Aggregated count of errors per second (including dropped packets when not using the drop command) xmit/s Number of packets transmitted on the output device per second \fP .fi .RE .PP Extended output mode displays at most the following fields: .RS .nf \fCFIELD DESCRIPTION receive Displays the number of packets received and errors encountered Whenever an error or packet drop occurs, details of per CPU error and drop statistics will be expanded inline in terse mode. 
pkt/s - Packets received per second drop/s - Packets dropped per second error/s - Errors encountered per second redirect - Displays the number of packets successfully redirected Errors encountered are expanded under redirect_err field Note that passing -s to enable it has a per packet overhead redir/s - Packets redirected successfully per second redirect_err Displays the number of packets that failed redirection The errno is expanded under this field with per CPU count The recognized errors are: EINVAL: Invalid redirection ENETDOWN: Device being redirected to is down EMSGSIZE: Packet length too large for device EOPNOTSUPP: Operation not supported ENOSPC: No space in ptr_ring of cpumap kthread error/s - Packets that failed redirection per second enqueue to cpu N Displays the number of packets enqueued to bulk queue of CPU N Expands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N Received packets can be associated with the CPU redirect program is enqueuing packets to. pkt/s - Packets enqueued per second from other CPU to CPU N drop/s - Packets dropped when trying to enqueue to CPU N bulk-avg - Average number of packets processed for each event kthread Displays the number of packets processed in CPUMAP kthread for each CPU Packets consumed from ptr_ring in kthread, and its xdp_stats (after calling CPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and then per-CPU to associate it to each CPU's pinned CPUMAP kthread. 
pkt/s - Packets consumed per second from ptr_ring drop/s - Packets dropped per second in kthread sched - Number of times kthread called schedule() xdp_stats (also expands to per-CPU counts) pass/s - XDP_PASS count for CPUMAP program execution drop/s - XDP_DROP count for CPUMAP program execution redir/s - XDP_REDIRECT count for CPUMAP program execution xdp_exception Displays xdp_exception tracepoint events This can occur due to internal driver errors, unrecognized XDP actions and due to explicit user trigger by use of XDP_ABORTED Each action is expanded below this field with its count hit/s - Number of times the tracepoint was hit per second devmap_xmit Displays devmap_xmit tracepoint events This tracepoint is invoked for successful transmissions on output device but these statistics are not available for generic XDP mode, hence they will be omitted from the output when using SKB mode xmit/s - Number of packets that were transmitted per second drop/s - Number of packets that failed transmissions per second drv_err/s - Number of internal driver errors per second bulk-avg - Average number of packets processed for each event \fP .fi .RE .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP Earlier xdp-redirect tools were written by Jesper Dangaard Brouer and John Fastabend. They were then rewritten to support more features by Kumar Kartikeya Dwivedi, who also ported them to xdp-tools together with Toke Høiland-Jørgensen. This man page was written by Kumar Kartikeya Dwivedi and Toke Høiland-Jørgensen. 
xdp-tools-1.6.1/xdp-bench/xdp-bench.c000066400000000000000000000412461514310632100173370ustar00rootroot00000000000000#define _GNU_SOURCE #include #include #include #include "xdp-bench.h" #include "params.h" #define PROG_NAME "xdp-bench" int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: xdp-bench COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " drop - Drop all packets on an interface\n" " pass - Pass all packets to the network stack\n" " tx - Transmit packets back out on an interface (hairpin forwarding)\n" " redirect - XDP redirect using the bpf_redirect() helper\n" " redirect-cpu - XDP CPU redirect using BPF_MAP_TYPE_CPUMAP\n" " redirect-map - XDP redirect using BPF_MAP_TYPE_DEVMAP\n" " redirect-multi - XDP multi-redirect using BPF_MAP_TYPE_DEVMAP and the BPF_F_BROADCAST flag\n" " xsk-drop - AF_XDP socket-based drop\n" " xsk-tx - AF_XDP socket-based hairpin forwarding\n" " help - show this help message\n" "\n" "Use 'xdp-bench COMMAND --help' to see options for each command\n"); return -1; } struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {NULL, 0} }; struct enum_val basic_program_modes[] = { {"no-touch", BASIC_NO_TOUCH}, {"read-data", BASIC_READ_DATA}, {"parse-ip", BASIC_PARSE_IPHDR}, {"swap-macs", BASIC_SWAP_MACS}, {NULL, 0} }; struct enum_val basic_load_modes[] = { {"dpa", BASIC_LOAD_DPA}, {"load-bytes", BASIC_LOAD_BYTES}, {NULL, 0} }; struct enum_val cpumap_remote_actions[] = { {"disabled", ACTION_DISABLED}, {"drop", ACTION_DROP}, {"pass", ACTION_PASS}, {"redirect", ACTION_REDIRECT}, {NULL, 0} }; struct enum_val cpumap_program_modes[] = { {"no-touch", CPUMAP_NO_TOUCH}, {"touch", CPUMAP_TOUCH_DATA}, {"round-robin", CPUMAP_CPU_ROUND_ROBIN}, {"l4-proto", CPUMAP_CPU_L4_PROTO}, {"l4-filter", CPUMAP_CPU_L4_PROTO_FILTER}, {"l4-hash", CPUMAP_CPU_L4_HASH}, {"l4-sport", CPUMAP_CPU_L4_SPORT}, {"l4-dport", CPUMAP_CPU_L4_DPORT}, {NULL, 0} }; struct enum_val devmap_egress_actions[] = 
{ {"forward", DEVMAP_EGRESS_FORWARD }, {"drop", DEVMAP_EGRESS_DROP }, {NULL, 0} }; struct enum_val xsk_program_modes[] = { {"rxdrop", XSK_RXDROP}, {"swap-macs", XSK_SWAP_MACS}, {NULL, 0} }; struct enum_val xsk_copy_modes[] = { {"auto", XSK_COPY_AUTO}, {"copy", XSK_COPY_COPY}, {"zero-copy", XSK_COPY_ZEROCOPY}, {NULL, 0} }; struct enum_val xsk_clocks[] = { {"MONOTONIC", XSK_CLOCK_MONOTONIC}, {"REALTIME", XSK_CLOCK_REALTIME}, {"TAI", XSK_CLOCK_TAI}, {"BOOTTIME", XSK_CLOCK_BOOTTIME}, {NULL, 0} }; struct enum_val xsk_sched_policies[] = { {"SCHED_OTHER", XSK_SCHED_OTHER}, {"SCHED_FIFO", XSK_SCHED_FIFO}, {NULL, 0} }; struct prog_option basic_options[] = { DEFINE_OPTION("packet-operation", OPT_ENUM, struct basic_opts, program_mode, .short_opt = 'p', .metavar = "", .typearg = basic_program_modes, .help = "Action to take before dropping packet."), DEFINE_OPTION("program-mode", OPT_ENUM, struct basic_opts, program_mode, .typearg = basic_program_modes, .hidden = true), DEFINE_OPTION("load-mode", OPT_ENUM, struct basic_opts, load_mode, .short_opt = 'l', .metavar = "", .typearg = basic_load_modes, .help = "How to load (and store) data; default dpa"), DEFINE_OPTION("rxq-stats", OPT_BOOL, struct basic_opts, rxq_stats, .short_opt = 'r', .help = "Collect per-RXQ drop statistics"), DEFINE_OPTION("interval", OPT_U32, struct basic_opts, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("extended", OPT_BOOL, struct basic_opts, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), DEFINE_OPTION("xdp-mode", OPT_ENUM, struct basic_opts, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("dev", OPT_IFNAME, struct basic_opts, iface_in, .positional = true, .metavar = "", .required = true, .help = "Load on device "), END_OPTIONS }; struct prog_option redirect_basic_options[] = { DEFINE_OPTION("load-mode", OPT_ENUM, struct 
redirect_opts, load_mode, .short_opt = 'l', .metavar = "", .typearg = basic_load_modes, .help = "How to load (and store) data; default dpa"), DEFINE_OPTION("interval", OPT_U32, struct redirect_opts, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("stats", OPT_BOOL, struct redirect_opts, stats, .short_opt = 's', .help = "Enable statistics for transmitted packets (not just errors)"), DEFINE_OPTION("extended", OPT_BOOL, struct redirect_opts, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), DEFINE_OPTION("mode", OPT_ENUM, struct redirect_opts, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("dev_in", OPT_IFNAME, struct redirect_opts, iface_in, .positional = true, .metavar = "", .required = true, .help = "Redirect from device "), DEFINE_OPTION("dev_out", OPT_IFNAME, struct redirect_opts, iface_out, .positional = true, .metavar = "", .required = true, .help = "Redirect to device "), END_OPTIONS }; struct prog_option redirect_cpumap_options[] = { DEFINE_OPTION("cpu", OPT_U32_MULTI, struct cpumap_opts, cpus, .short_opt = 'c', .metavar = "", .required = true, .help = "Insert CPU into CPUMAP (can be specified multiple times)"), DEFINE_OPTION("dev", OPT_IFNAME, struct cpumap_opts, iface_in, .positional = true, .metavar = "", .required = true, .help = "Run on "), DEFINE_OPTION("program-mode", OPT_ENUM, struct cpumap_opts, program_mode, .short_opt = 'p', .metavar = "", .typearg = cpumap_program_modes, .help = "Redirect to CPUs using . Default l4-hash."), DEFINE_OPTION("remote-action", OPT_ENUM, struct cpumap_opts, remote_action, .short_opt = 'r', .metavar = "", .typearg = cpumap_remote_actions, .help = "Perform on the remote CPU. 
Default disabled."), DEFINE_OPTION("redirect-device", OPT_IFNAME, struct cpumap_opts, redir_iface, .short_opt = 'D', .metavar = "", .help = "Redirect packets to on remote CPU (when --remote-action is 'redirect')"), DEFINE_OPTION("qsize", OPT_U32, struct cpumap_opts, qsize, .short_opt = 'q', .metavar = "", .help = "CPUMAP queue size (default 2048)"), DEFINE_OPTION("stress-mode", OPT_BOOL, struct cpumap_opts, stress_mode, .short_opt = 'x', .help = "Stress the kernel CPUMAP setup and teardown code while running"), DEFINE_OPTION("interval", OPT_U32, struct cpumap_opts, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("stats", OPT_BOOL, struct cpumap_opts, stats, .short_opt = 's', .help = "Enable statistics for transmitted packets (not just errors)"), DEFINE_OPTION("extended", OPT_BOOL, struct cpumap_opts, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), DEFINE_OPTION("xdp-mode", OPT_ENUM, struct cpumap_opts, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), END_OPTIONS }; struct prog_option redirect_devmap_options[] = { DEFINE_OPTION("load-egress", OPT_BOOL, struct devmap_opts, load_egress, .short_opt = 'X', .help = "Load an egress program into the devmap"), DEFINE_OPTION("egress-action", OPT_ENUM, struct devmap_opts, egress_action, .short_opt = 'A', .typearg = devmap_egress_actions, .metavar = "", .help = "Egress program . 
Default is forward"), DEFINE_OPTION("interval", OPT_U32, struct devmap_opts, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("stats", OPT_BOOL, struct devmap_opts, stats, .short_opt = 's', .help = "Enable statistics for transmitted packets (not just errors)"), DEFINE_OPTION("extended", OPT_BOOL, struct devmap_opts, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), DEFINE_OPTION("mode", OPT_ENUM, struct devmap_opts, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("dev_in", OPT_IFNAME, struct devmap_opts, iface_in, .positional = true, .metavar = "", .required = true, .help = "Redirect from device "), DEFINE_OPTION("dev_out", OPT_IFNAME, struct devmap_opts, iface_out, .positional = true, .metavar = "", .required = true, .help = "Redirect to device "), END_OPTIONS }; struct prog_option redirect_devmap_multi_options[] = { DEFINE_OPTION("load-egress", OPT_BOOL, struct devmap_multi_opts, load_egress, .short_opt = 'X', .help = "Load an egress program into the devmap"), DEFINE_OPTION("egress-action", OPT_ENUM, struct devmap_multi_opts, egress_action, .short_opt = 'A', .typearg = devmap_egress_actions, .metavar = "", .help = "Egress program . 
Default is forward"), DEFINE_OPTION("interval", OPT_U32, struct devmap_multi_opts, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("stats", OPT_BOOL, struct devmap_multi_opts, stats, .short_opt = 's', .help = "Enable statistics for transmitted packets (not just errors)"), DEFINE_OPTION("extended", OPT_BOOL, struct devmap_multi_opts, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), DEFINE_OPTION("mode", OPT_ENUM, struct devmap_multi_opts, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("devs", OPT_IFNAME_MULTI, struct devmap_multi_opts, ifaces, .positional = true, .metavar = "", .min_num = 2, .max_num = MAX_IFACE_NUM, .required = true, .help = "Redirect from and to devices "), END_OPTIONS }; struct prog_option xsk_options[] = { DEFINE_OPTION("queue", OPT_U32, struct xsk_opts, queue_idx, .short_opt = 'q', .metavar = "", .help = "Queue index to use (default 0)"), DEFINE_OPTION("interval", OPT_U32, struct xsk_opts, interval, .short_opt = 'i', .metavar = "", .help = "Statistics update interval (default 2)"), DEFINE_OPTION("retries", OPT_U32, struct xsk_opts, retries, .short_opt = 'O', .metavar = "", .help = "Number of time-out retries per 1s interval (default 3)"), DEFINE_OPTION("frame-size", OPT_U32, struct xsk_opts, frame_size, .short_opt = 'f', .metavar = "", .help = "Data frame size (must be a power of two in aligned mode); (default 4096)"), DEFINE_OPTION("duration", OPT_U32, struct xsk_opts, duration, .short_opt = 'd', .metavar = "", .help = "Duration to run; default 0 (forever)"), DEFINE_OPTION("batch-size", OPT_U32, struct xsk_opts, batch_size, .short_opt = 'b', .metavar = "", .help = "Batch size for receive loop; default 64"), DEFINE_OPTION("irq-string", OPT_STRING, struct xsk_opts, irq_string, .short_opt = 'I', .metavar = "", .help = "Display driver interrupt statistics for interface 
associated with "), DEFINE_OPTION("poll", OPT_BOOL, struct xsk_opts, use_poll, .short_opt = 'p', .help = "Use poll syscall"), DEFINE_OPTION("no-need-wakeup", OPT_BOOL, struct xsk_opts, no_need_wakeup, .short_opt = 'm', .help = "Turn off use of driver need wakeup flag"), DEFINE_OPTION("unaligned", OPT_BOOL, struct xsk_opts, unaligned, .short_opt = 'u', .help = "Enable unaligned chunk placement"), DEFINE_OPTION("shared-umem", OPT_BOOL, struct xsk_opts, shared_umem, .short_opt = 'M', .help = "Enable XDP_SHARED_UMEM across multiple sockets"), DEFINE_OPTION("extra-stats", OPT_BOOL, struct xsk_opts, extra_stats, .short_opt = 'x', .help = "Display extra statistics"), DEFINE_OPTION("quiet", OPT_BOOL, struct xsk_opts, quiet, .short_opt = 'Q', .help = "Do not display any stats"), DEFINE_OPTION("app-stats", OPT_BOOL, struct xsk_opts, app_stats, .short_opt = 'a', .help = "Display application (syscall) statistics"), DEFINE_OPTION("busy-poll", OPT_BOOL, struct xsk_opts, busy_poll, .short_opt = 'B', .help = "Enable busy polling"), DEFINE_OPTION("frags", OPT_BOOL, struct xsk_opts, frags, .short_opt = 'F', .help = "Enable frags (multi-buffer) support"), DEFINE_OPTION("copy_mode", OPT_ENUM, struct xsk_opts, copy_mode, .short_opt = 'C', .typearg = xsk_copy_modes, .metavar = "", .help = "Use for copying data packets to userspace; default auto"), DEFINE_OPTION("clock", OPT_ENUM, struct xsk_opts, clock, .short_opt = 'w', .typearg = xsk_clocks, .metavar = "", .help = "Clock name to use; default MONOTONIC"), DEFINE_OPTION("policy", OPT_ENUM, struct xsk_opts, sched_policy, .short_opt = 'W', .typearg = xsk_sched_policies, .metavar = "", .help = "Scheduler policy; default SCHED_OTHER"), DEFINE_OPTION("schpri", OPT_U32, struct xsk_opts, sched_prio, .short_opt = 'U', .metavar = "", .help = "Scheduler priority; default 0"), DEFINE_OPTION("attach-mode", OPT_ENUM, struct xsk_opts, attach_mode, .short_opt = 'A', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), 
DEFINE_OPTION("dev", OPT_IFNAME, struct xsk_opts, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), END_OPTIONS }; static const struct prog_command cmds[] = { { .name = "drop", .func = do_drop, .options = basic_options, .default_cfg = &defaults_drop, .doc = "Drop all packets on an interface" }, { .name = "pass", .func = do_pass, .options = basic_options, .default_cfg = &defaults_pass, .doc = "Pass all packets to the network stack" }, { .name = "tx", .func = do_tx, .options = basic_options, .default_cfg = &defaults_tx, .doc = "Transmit packets back out an interface (hairpin forwarding)" }, DEFINE_COMMAND_NAME("redirect", redirect_basic, "XDP redirect using the bpf_redirect() helper"), DEFINE_COMMAND_NAME("redirect-cpu", redirect_cpumap, "XDP CPU redirect using BPF_MAP_TYPE_CPUMAP"), DEFINE_COMMAND_NAME("redirect-map", redirect_devmap, "XDP redirect using BPF_MAP_TYPE_DEVMAP"), DEFINE_COMMAND_NAME( "redirect-multi", redirect_devmap_multi, "XDP multi-redirect using BPF_MAP_TYPE_DEVMAP and the BPF_F_BROADCAST flag"), { .name = "xsk-drop", .func = do_xsk_drop, .options = xsk_options, .default_cfg = &defaults_xsk, .doc = "AF_XDP-based packet drop" }, { .name = "xsk-tx", .func = do_xsk_tx, .options = xsk_options, .default_cfg = &defaults_xsk, .doc = "AF_XDP-based transmit back out an interface (hairpin forwarding)" }, { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct basic_opts basic; struct cpumap_opts cpumap; struct devmap_opts devmap; struct devmap_multi_opts devmap_multi; struct xsk_opts xsk; }; int main(int argc, char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, false); return do_help(NULL, NULL); } xdp-tools-1.6.1/xdp-bench/xdp-bench.h000066400000000000000000000054231514310632100173410ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only #ifndef XDP_REDIRECT_H #define XDP_REDIRECT_H #include 
#include "params.h" #include "util.h" #include "xdpsock.h" #define MAX_IFACE_NUM 32 int do_drop(const void *cfg, const char *pin_root_path); int do_pass(const void *cfg, const char *pin_root_path); int do_tx(const void *cfg, const char *pin_root_path); int do_redirect_basic(const void *cfg, const char *pin_root_path); int do_redirect_cpumap(const void *cfg, const char *pin_root_path); int do_redirect_devmap(const void *cfg, const char *pin_root_path); int do_redirect_devmap_multi(const void *cfg, const char *pin_root_path); int do_xsk_drop(const void *cfg, const char *pin_root_path); int do_xsk_tx(const void *cfg, const char *pin_root_path); enum basic_program_mode { BASIC_NO_TOUCH, BASIC_READ_DATA, BASIC_PARSE_IPHDR, BASIC_SWAP_MACS, }; enum basic_load_mode { BASIC_LOAD_DPA, BASIC_LOAD_BYTES, }; struct basic_opts { bool extended; bool rxq_stats; __u32 interval; enum xdp_attach_mode mode; enum basic_program_mode program_mode; enum basic_load_mode load_mode; struct iface iface_in; }; struct redirect_opts { bool stats; bool extended; __u32 interval; enum xdp_attach_mode mode; enum basic_load_mode load_mode; struct iface iface_in; struct iface iface_out; }; enum devmap_egress_action { DEVMAP_EGRESS_NONE, DEVMAP_EGRESS_FORWARD, DEVMAP_EGRESS_DROP, }; struct devmap_opts { bool stats; bool extended; bool load_egress; enum devmap_egress_action egress_action; __u32 interval; enum xdp_attach_mode mode; struct iface iface_in; struct iface iface_out; }; struct devmap_multi_opts { bool stats; bool extended; bool load_egress; enum devmap_egress_action egress_action; __u32 interval; enum xdp_attach_mode mode; struct iface *ifaces; }; enum cpumap_remote_action { ACTION_DISABLED, ACTION_DROP, ACTION_PASS, ACTION_REDIRECT, }; enum cpumap_program_mode { CPUMAP_NO_TOUCH, CPUMAP_TOUCH_DATA, CPUMAP_CPU_ROUND_ROBIN, CPUMAP_CPU_L4_PROTO, CPUMAP_CPU_L4_PROTO_FILTER, CPUMAP_CPU_L4_HASH, CPUMAP_CPU_L4_SPORT, CPUMAP_CPU_L4_DPORT, }; struct cpumap_opts { bool stats; bool extended; bool 
stress_mode; __u32 interval; __u32 qsize; struct u32_multi cpus; enum xdp_attach_mode mode; enum cpumap_remote_action remote_action; enum cpumap_program_mode program_mode; struct iface iface_in; struct iface redir_iface; }; extern const struct basic_opts defaults_drop; extern const struct basic_opts defaults_pass; extern const struct basic_opts defaults_tx; extern const struct redirect_opts defaults_redirect_basic; extern const struct cpumap_opts defaults_redirect_cpumap; extern const struct devmap_opts defaults_redirect_devmap; extern const struct devmap_multi_opts defaults_redirect_devmap_multi; extern const struct xsk_opts defaults_xsk; #endif xdp-tools-1.6.1/xdp-bench/xdp_basic.bpf.c000066400000000000000000000124321514310632100201640ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2016 John Fastabend * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*/ #include #include #include #include #include #include #ifndef HAVE_LIBBPF_BPF_PROGRAM__TYPE static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; #endif const volatile bool rxq_stats = 0; const volatile enum xdp_action action = XDP_DROP; static int parse_ip_header_load(struct xdp_md *ctx) { int eth_type, ip_type, err, offset = 0; struct ipv6hdr ipv6hdr; struct iphdr iphdr; struct ethhdr eth; err = bpf_xdp_load_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; eth_type = eth.h_proto; offset = sizeof(eth); if (eth_type == bpf_htons(ETH_P_IP)) { err = bpf_xdp_load_bytes(ctx, offset, &iphdr, sizeof(iphdr)); if (err) return err; ip_type = iphdr.protocol; if (ip_type < 0) return ip_type; } else if (eth_type == bpf_htons(ETH_P_IPV6)) { err = bpf_xdp_load_bytes(ctx, offset, &ipv6hdr, sizeof(ipv6hdr)); if (err) return err; ip_type = ipv6hdr.nexthdr; if (ip_type < 0) return ip_type; } return 0; } static int parse_ip_header(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct hdr_cursor nh = { .pos = data }; struct ipv6hdr *ipv6hdr; struct iphdr *iphdr; struct ethhdr *eth; int eth_type, ip_type; eth_type = parse_ethhdr(&nh, data_end, ð); if (eth_type < 0) return eth_type; if (eth_type == bpf_htons(ETH_P_IP)) { ip_type = parse_iphdr(&nh, data_end, &iphdr); if (ip_type < 0) return ip_type; } else if (eth_type == bpf_htons(ETH_P_IPV6)) { ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); if (ip_type < 0) return ip_type; } return 0; } static int record_stats(__u32 rxq_idx, bool success) { __u32 key = bpf_get_smp_processor_id(); struct datarec *rec; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return -1; NO_TEAR_INC(rec->processed); if (action == XDP_DROP && success) NO_TEAR_INC(rec->dropped); if (rxq_stats) { struct datarec *rxq_rec; rxq_rec = 
bpf_map_lookup_elem(&rxq_cnt, &rxq_idx); if (!rxq_rec) return -1; NO_TEAR_INC(rxq_rec->processed); if (action == XDP_DROP && success) NO_TEAR_INC(rxq_rec->dropped); } return 0; } SEC("xdp") int xdp_basic_prog(struct xdp_md *ctx) { if (record_stats(ctx->rx_queue_index, true)) return XDP_ABORTED; return action; } SEC("xdp") int xdp_read_data_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; int ret = action; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_ABORTED; if (bpf_ntohs(eth->h_proto) < ETH_P_802_3_MIN) ret = XDP_ABORTED; if (record_stats(ctx->rx_queue_index, ret==action)) return XDP_ABORTED; return ret; } SEC("xdp") int xdp_read_data_load_bytes_prog(struct xdp_md *ctx) { int err, offset = 0; struct ethhdr eth; int ret = action; err = bpf_xdp_load_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; if (bpf_ntohs(eth.h_proto) < ETH_P_802_3_MIN) ret = XDP_ABORTED; if (record_stats(ctx->rx_queue_index, ret==action)) return XDP_ABORTED; return ret; } SEC("xdp") int xdp_swap_macs_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_ABORTED; swap_src_dst_mac(data); if (record_stats(ctx->rx_queue_index, true)) return XDP_ABORTED; return action; } SEC("xdp") int xdp_swap_macs_load_bytes_prog(struct xdp_md *ctx) { int err, offset = 0; struct ethhdr eth; err = bpf_xdp_load_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; swap_src_dst_mac(ð); err = bpf_xdp_store_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; if (record_stats(ctx->rx_queue_index, true)) return XDP_ABORTED; return action; } SEC("xdp") int xdp_parse_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; int ret = action; __u64 
nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_ABORTED; if (parse_ip_header(ctx)) ret = XDP_ABORTED; if (record_stats(ctx->rx_queue_index, ret==action)) return XDP_ABORTED; return ret; } SEC("xdp") int xdp_parse_load_bytes_prog(struct xdp_md *ctx) { int ret = action; if (parse_ip_header_load(ctx)) ret = XDP_ABORTED; if (record_stats(ctx->rx_queue_index, ret==action)) return XDP_ABORTED; return ret; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-bench/xdp_basic.c000066400000000000000000000113221514310632100174130ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 John Fastabend */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp-bench.h" #include "xdp_sample.h" #include "xdp_basic.skel.h" static int mask = SAMPLE_RX_CNT | SAMPLE_EXCEPTION_CNT; DEFINE_SAMPLE_INIT(xdp_basic); const struct basic_opts defaults_drop = { .mode = XDP_MODE_NATIVE, .interval = 2 }; const struct basic_opts defaults_pass = { .mode = XDP_MODE_NATIVE, .interval = 2 }; const struct basic_opts defaults_tx = { .mode = XDP_MODE_NATIVE, .interval = 2, .program_mode = BASIC_SWAP_MACS }; static int do_basic(const struct basic_opts *opt, enum xdp_action action) { DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct xdp_program *xdp_prog = NULL; struct bpf_program *prog = NULL; int ret = EXIT_FAIL_OPTION; struct xdp_basic *skel; if (opt->extended) sample_switch_mode(); skel = xdp_basic__open(); if (!skel) { pr_warn("Failed to xdp_basic__open: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end; } ret = sample_init_pre_load(skel, opt->iface_in.ifname); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } skel->rodata->action = action; if (action == XDP_DROP) mask |= SAMPLE_DROP_OK; if (opt->rxq_stats) { 
skel->rodata->rxq_stats = true; mask |= SAMPLE_RXQ_STATS; } /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL) if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); switch (opt->program_mode) { case BASIC_NO_TOUCH: opts.prog_name = "xdp_basic_prog"; break; case BASIC_READ_DATA: opts.prog_name = (opt->load_mode == BASIC_LOAD_BYTES) ? "xdp_read_data_load_bytes_prog" : "xdp_read_data_prog"; break; case BASIC_PARSE_IPHDR: opts.prog_name = (opt->load_mode == BASIC_LOAD_BYTES) ? "xdp_parse_load_bytes_prog" : "xdp_parse_prog"; break; case BASIC_SWAP_MACS: opts.prog_name = (opt->load_mode == BASIC_LOAD_BYTES) ? "xdp_swap_macs_load_bytes_prog" : "xdp_swap_macs_prog"; break; } opts.obj = skel->obj; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { ret = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-ret)); goto end_destroy; } /* We always set the frags support bit: nothing the program does is * incompatible with multibuf, and it's perfectly fine to load a program * with frags support on an interface with a small MTU. We don't risk * setting any flags the kernel will balk at, either, since libxdp will * do the feature probing for us and skip the flag if the kernel doesn't * support it. * * The function below returns EOPNOTSUPP it libbpf is too old to support * setting the flags, but we just ignore that, since in such a case the * best we can do is just attempt to run without the frags support. 
*/ xdp_program__set_xdp_frags_support(xdp_prog, true); ret = xdp_program__attach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); if (ret < 0) { pr_warn("Failed to attach XDP program: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = sample_init(skel, mask, 0, 0); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = EXIT_FAIL; pr_info("%s packets on %s (ifindex %d; driver %s)\n", action == XDP_DROP ? "Dropping" : action == XDP_TX ? "Hairpinning (XDP_TX)" : "Passing", opt->iface_in.ifname, opt->iface_in.ifindex, get_driver_name(opt->iface_in.ifindex)); ret = sample_run(opt->interval, NULL, NULL); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = EXIT_OK; end_detach: xdp_program__detach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); end_destroy: xdp_basic__destroy(skel); end: sample_teardown(); return ret; } int do_drop(const void *cfg, __unused const char *pin_root_path) { const struct basic_opts *opt = cfg; return do_basic(opt, XDP_DROP); } int do_pass(const void *cfg, __unused const char *pin_root_path) { const struct basic_opts *opt = cfg; return do_basic(opt, XDP_PASS); } int do_tx(const void *cfg, __unused const char *pin_root_path) { const struct basic_opts *opt = cfg; return do_basic(opt, XDP_TX); } xdp-tools-1.6.1/xdp-bench/xdp_redirect_basic.bpf.c000066400000000000000000000040021514310632100220370ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2016 John Fastabend * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * General Public License for more details. */ #include #include #include #include #include #ifndef HAVE_LIBBPF_BPF_PROGRAM__TYPE static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; #endif const volatile int ifindex_out; SEC("xdp") int xdp_redirect_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; struct datarec *rec; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); swap_src_dst_mac(data); return bpf_redirect(ifindex_out, 0); } SEC("xdp") int xdp_redirect_load_bytes_prog(struct xdp_md *ctx) { __u32 key = bpf_get_smp_processor_id(); int err, offset = 0; struct datarec *rec; struct ethhdr eth; err = bpf_xdp_load_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); swap_src_dst_mac(ð); err = bpf_xdp_store_bytes(ctx, offset, ð, sizeof(eth)); if (err) return err; return bpf_redirect(ifindex_out, 0); } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-bench/xdp_redirect_basic.c000066400000000000000000000114441514310632100213010ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 John Fastabend */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp-bench.h" #include "xdp_sample.h" #include "xdp_redirect_basic.skel.h" static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_CNT | SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; 
DEFINE_SAMPLE_INIT(xdp_redirect_basic); const struct redirect_opts defaults_redirect_basic = { .mode = XDP_MODE_NATIVE, .interval = 2 }; int do_redirect_basic(const void *cfg, __unused const char *pin_root_path) { const struct redirect_opts *opt = cfg; struct xdp_program *xdp_prog = NULL, *dummy_prog = NULL; DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct bpf_program *prog = NULL; struct xdp_redirect_basic *skel; char str[2 * IF_NAMESIZE + 1]; int ret = EXIT_FAIL_OPTION; if (opt->extended) sample_switch_mode(); if (opt->mode == XDP_MODE_SKB) /* devmap_xmit tracepoint not available */ mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI); if (opt->stats) mask |= SAMPLE_REDIRECT_CNT; skel = xdp_redirect_basic__open(); if (!skel) { pr_warn("Failed to xdp_redirect_basic__open: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end; } ret = sample_init_pre_load(skel, opt->iface_in.ifname); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } skel->rodata->from_match[0] = opt->iface_in.ifindex; skel->rodata->to_match[0] = opt->iface_out.ifindex; skel->rodata->ifindex_out = opt->iface_out.ifindex; /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL) if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); opts.obj = skel->obj; opts.prog_name = (opt->load_mode == BASIC_LOAD_BYTES) ? "xdp_redirect_load_bytes_prog" : "xdp_redirect_prog"; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { ret = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-ret)); goto end_destroy; } /* We always set the frags support bit: nothing the program does is * incompatible with multibuf, and it's perfectly fine to load a program * with frags support on an interface with a small MTU. 
We don't risk * setting any flags the kernel will balk at, either, since libxdp will * do the feature probing for us and skip the flag if the kernel doesn't * support it. * * The function below returns EOPNOTSUPP it libbpf is too old to support * setting the flags, but we just ignore that, since in such a case the * best we can do is just attempt to run without the frags support. */ xdp_program__set_xdp_frags_support(xdp_prog, true); ret = xdp_program__attach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); if (ret < 0) { pr_warn("Failed to attach XDP program: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = sample_init(skel, mask, opt->iface_in.ifindex, opt->iface_out.ifindex); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } opts.obj = NULL; opts.prog_name = "xdp_pass"; opts.find_filename = "xdp-dispatcher.o"; dummy_prog = xdp_program__create(&opts); if (!dummy_prog) { pr_warn("Failed to load dummy program: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_detach; } xdp_program__set_xdp_frags_support(dummy_prog, true); ret = xdp_program__attach(dummy_prog, opt->iface_out.ifindex, opt->mode, 0); if (ret < 0) { pr_warn("Failed to attach dummy program: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_detach; } ret = EXIT_FAIL; safe_strncpy(str, get_driver_name(opt->iface_in.ifindex), sizeof(str)); pr_info("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", opt->iface_in.ifname, opt->iface_in.ifindex, str, opt->iface_out.ifname, opt->iface_out.ifindex, get_driver_name(opt->iface_out.ifindex)); ret = sample_run(opt->interval, NULL, NULL); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = EXIT_OK; end_detach: if (dummy_prog) xdp_program__detach(dummy_prog, opt->iface_out.ifindex, opt->mode, 0); xdp_program__detach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); end_destroy: 
xdp_redirect_basic__destroy(skel); end: sample_teardown(); return ret; } xdp-tools-1.6.1/xdp-bench/xdp_redirect_cpumap.bpf.c000066400000000000000000000364301514310632100222550ustar00rootroot00000000000000/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) * * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ #include #include #include #include #include #include "hash_func01.h" /* Special map type that can XDP_REDIRECT frames to another CPU */ struct { __uint(type, BPF_MAP_TYPE_CPUMAP); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(struct bpf_cpumap_val)); } cpu_map SEC(".maps"); /* Set of maps controlling available CPU, and for iterating through * selectable redirect CPUs. */ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, __u32); __type(value, __u32); } cpus_available SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, __u32); __type(value, __u32); __uint(max_entries, 1); } cpus_count SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, __u32); __type(value, __u32); __uint(max_entries, 1); } cpus_iterator SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_DEVMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(struct bpf_devmap_val)); __uint(max_entries, 1); } tx_port SEC(".maps"); char tx_mac_addr[ETH_ALEN]; /* Helper parse functions */ static __always_inline bool parse_eth(struct ethhdr *eth, void *data_end, __u16 *eth_proto, __u64 *l3_offset) { __u16 eth_type; __u64 offset; offset = sizeof(*eth); if ((void *)eth + offset > data_end) return false; eth_type = eth->h_proto; /* Skip non 802.3 Ethertypes */ if (__builtin_expect(bpf_ntohs(eth_type) < ETH_P_802_3_MIN, 0)) return false; /* Handle VLAN tagged packet */ if (eth_type == bpf_htons(ETH_P_8021Q) || eth_type == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vlan_hdr; vlan_hdr = (void *)eth + offset; offset += sizeof(*vlan_hdr); if ((void *)eth + offset > data_end) return false; eth_type = vlan_hdr->h_vlan_encapsulated_proto; 
} /* Handle double VLAN tagged packet */ if (eth_type == bpf_htons(ETH_P_8021Q) || eth_type == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vlan_hdr; vlan_hdr = (void *)eth + offset; offset += sizeof(*vlan_hdr); if ((void *)eth + offset > data_end) return false; eth_type = vlan_hdr->h_vlan_encapsulated_proto; } *eth_proto = bpf_ntohs(eth_type); *l3_offset = offset; return true; } static __always_inline __u16 get_port_ipv4_udp(struct xdp_md *ctx, __u64 nh_off, bool src) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; struct udphdr *udph; if (iph + 1 > data_end) return 0; if (!(iph->protocol == IPPROTO_UDP)) return 0; udph = (void *)(iph + 1); if (udph + 1 > data_end) return 0; if (src) return bpf_ntohs(udph->source); else return bpf_ntohs(udph->dest); } static __always_inline __u16 get_port_ipv6_udp(struct xdp_md *ctx, __u64 nh_off, bool src) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ipv6hdr *ip6h = data + nh_off; struct udphdr *udph; if (ip6h + 1 > data_end) return 0; if (!(ip6h->nexthdr == IPPROTO_UDP)) return 0; udph = (void *)(ip6h + 1); if (udph + 1 > data_end) return 0; if (src) return bpf_ntohs(udph->source); else return bpf_ntohs(udph->dest); } static __always_inline __u16 get_port_ipv4_tcp(struct xdp_md *ctx, __u64 nh_off, bool src) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; struct tcphdr *tcph; if (iph + 1 > data_end) return 0; if (!(iph->protocol == IPPROTO_TCP)) return 0; tcph = (void *)(iph + 1); if (tcph + 1 > data_end) return 0; if (src) return bpf_ntohs(tcph->source); else return bpf_ntohs(tcph->dest); } static __always_inline __u16 get_port_ipv6_tcp(struct xdp_md *ctx, __u64 nh_off, bool src) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ipv6hdr *ip6h = data + nh_off; struct tcphdr *tcph; if (ip6h + 1 > 
data_end) return 0; if (!(ip6h->nexthdr == IPPROTO_UDP)) return 0; tcph = (void *)(ip6h + 1); if (tcph + 1 > data_end) return 0; if (src) return bpf_ntohs(tcph->source); else return bpf_ntohs(tcph->dest); } static __always_inline int get_proto_ipv4(struct xdp_md *ctx, __u64 nh_off) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; if (iph + 1 > data_end) return 0; return iph->protocol; } static __always_inline int get_proto_ipv6(struct xdp_md *ctx, __u64 nh_off) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ipv6hdr *ip6h = data + nh_off; if (ip6h + 1 > data_end) return 0; return ip6h->nexthdr; } SEC("xdp") int cpumap_no_touch(struct xdp_md *ctx) { __u32 key = bpf_get_smp_processor_id(); struct datarec *rec; __u32 *cpu_selected; __u32 cpu_dest = 0; __u32 key0 = 0; /* Only use first entry in cpus_available */ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); if (!cpu_selected) return XDP_ABORTED; cpu_dest = *cpu_selected; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } SEC("xdp") int cpumap_touch_data(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; struct datarec *rec; __u32 *cpu_selected; __u32 cpu_dest = 0; __u32 key0 = 0; __u16 eth_type; /* Only use first entry in cpus_available */ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); if (!cpu_selected) return XDP_ABORTED; cpu_dest = *cpu_selected; /* Validate packet length is minimum Eth header size */ if (eth + 1 > data_end) return XDP_ABORTED; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); /* Read packet data, and use it (drop non 802.3 
Ethertypes) */ eth_type = eth->h_proto; if (bpf_ntohs(eth_type) < ETH_P_802_3_MIN) { NO_TEAR_INC(rec->dropped); return XDP_DROP; } if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } SEC("xdp") int cpumap_round_robin(struct xdp_md *ctx) { __u32 key = bpf_get_smp_processor_id(); struct datarec *rec; __u32 cpu_dest = 0; __u32 key0 = 0; __u32 *cpu_selected; __u32 *cpu_iterator; __u32 *cpu_max; __u32 cpu_idx; cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); if (!cpu_max) return XDP_ABORTED; cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); if (!cpu_iterator) return XDP_ABORTED; cpu_idx = *cpu_iterator; *cpu_iterator += 1; if (*cpu_iterator == *cpu_max) *cpu_iterator = 0; cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); if (!cpu_selected) return XDP_ABORTED; cpu_dest = *cpu_selected; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } SEC("xdp") int cpumap_l4_proto(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; __u8 ip_proto = IPPROTO_UDP; struct datarec *rec; __u16 eth_proto = 0; __u64 l3_offset = 0; __u32 cpu_dest = 0; __u32 *cpu_lookup; __u32 cpu_idx = 0; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ /* Extract L4 protocol */ switch (eth_proto) { case ETH_P_IP: ip_proto = get_proto_ipv4(ctx, l3_offset); break; case ETH_P_IPV6: ip_proto = get_proto_ipv6(ctx, l3_offset); break; case ETH_P_ARP: cpu_idx = 0; /* ARP packet handled on separate CPU */ break; default: cpu_idx = 0; } /* Choose CPU based on L4 protocol */ switch (ip_proto) { case 
IPPROTO_ICMP: case IPPROTO_ICMPV6: cpu_idx = 2; break; case IPPROTO_TCP: cpu_idx = 0; break; case IPPROTO_UDP: cpu_idx = 1; break; default: cpu_idx = 0; } cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); if (!cpu_lookup) return XDP_ABORTED; cpu_dest = *cpu_lookup; if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } SEC("xdp") int cpumap_l4_filter(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; __u8 ip_proto = IPPROTO_UDP; struct datarec *rec; __u16 eth_proto = 0; __u64 l3_offset = 0; __u32 cpu_dest = 0; __u32 *cpu_lookup; __u32 cpu_idx = 0; __u16 dest_port; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ /* Extract L4 protocol */ switch (eth_proto) { case ETH_P_IP: ip_proto = get_proto_ipv4(ctx, l3_offset); break; case ETH_P_IPV6: ip_proto = get_proto_ipv6(ctx, l3_offset); break; case ETH_P_ARP: cpu_idx = 0; /* ARP packet handled on separate CPU */ break; default: cpu_idx = 0; } /* Choose CPU based on L4 protocol */ switch (ip_proto) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: cpu_idx = 2; break; case IPPROTO_TCP: cpu_idx = 0; break; case IPPROTO_UDP: cpu_idx = 1; /* DDoS filter UDP port 9 (pktgen) */ dest_port = get_port_ipv4_udp(ctx, l3_offset, false); if (dest_port == 9) { NO_TEAR_INC(rec->dropped); return XDP_DROP; } break; default: cpu_idx = 0; } cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); if (!cpu_lookup) return XDP_ABORTED; cpu_dest = *cpu_lookup; if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } /* Hashing initval */ #define INITVAL 15485863 static __always_inline __u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, __u64 nh_off) { 
void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; __u32 cpu_hash; if (iph + 1 > data_end) return 0; cpu_hash = iph->saddr + iph->daddr; cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); return cpu_hash; } static __always_inline __u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, __u64 nh_off) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ipv6hdr *ip6h = data + nh_off; __u32 cpu_hash; if (ip6h + 1 > data_end) return 0; cpu_hash = ip6h->saddr.in6_u.u6_addr32[0] + ip6h->daddr.in6_u.u6_addr32[0]; cpu_hash += ip6h->saddr.in6_u.u6_addr32[1] + ip6h->daddr.in6_u.u6_addr32[1]; cpu_hash += ip6h->saddr.in6_u.u6_addr32[2] + ip6h->daddr.in6_u.u6_addr32[2]; cpu_hash += ip6h->saddr.in6_u.u6_addr32[3] + ip6h->daddr.in6_u.u6_addr32[3]; cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); return cpu_hash; } /* Load-Balance traffic based on hashing IP-addrs + L4-proto. The * hashing scheme is symmetric, meaning swapping IP src/dest still hit * same CPU. 
*/ SEC("xdp") int cpumap_l4_hash(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; struct datarec *rec; __u16 eth_proto = 0; __u64 l3_offset = 0; __u32 cpu_dest = 0; __u32 cpu_idx = 0; __u32 *cpu_lookup; __u32 key0 = 0; __u32 *cpu_max; __u32 cpu_hash; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); if (!cpu_max) return XDP_ABORTED; if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ /* Hash for IPv4 and IPv6 */ switch (eth_proto) { case ETH_P_IP: cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); break; case ETH_P_IPV6: cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); break; case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ default: cpu_hash = 0; } /* Choose CPU based on hash */ cpu_idx = cpu_hash % *cpu_max; cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); if (!cpu_lookup) return XDP_ABORTED; cpu_dest = *cpu_lookup; if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } static __always_inline int cpumap_l4_port(struct xdp_md *ctx, bool src) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; __u8 ip_proto = IPPROTO_UDP; struct datarec *rec; __u16 eth_proto = 0; __u64 l3_offset = 0; __u32 cpu_dest = 0; __u32 *cpu_lookup; __u32 cpu_idx = 0; __u32 *cpu_max; __u32 key0 = 0; __u16 port; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); if (!cpu_max) return XDP_ABORTED; if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) return XDP_PASS; /* Just skip */ /* Extract L4 source port */ switch (eth_proto) { case ETH_P_IP: ip_proto = 
get_proto_ipv4(ctx, l3_offset); switch (ip_proto) { case IPPROTO_TCP: port = get_port_ipv4_tcp(ctx, l3_offset, src); break; case IPPROTO_UDP: port = get_port_ipv4_udp(ctx, l3_offset, src); break; default: port = 0; } break; case ETH_P_IPV6: ip_proto = get_proto_ipv6(ctx, l3_offset); switch (ip_proto) { case IPPROTO_TCP: port = get_port_ipv6_tcp(ctx, l3_offset, src); break; case IPPROTO_UDP: port = get_port_ipv6_udp(ctx, l3_offset, src); break; default: port = 0; } break; default: port = 0; } cpu_idx = port % *cpu_max; cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); if (!cpu_lookup) return XDP_ABORTED; cpu_dest = *cpu_lookup; if (cpu_dest >= nr_cpus) { NO_TEAR_INC(rec->issue); return XDP_ABORTED; } return bpf_redirect_map(&cpu_map, cpu_dest, 0); } SEC("xdp") int cpumap_l4_sport(struct xdp_md *ctx) { return cpumap_l4_port(ctx, true); } SEC("xdp") int cpumap_l4_dport(struct xdp_md *ctx) { return cpumap_l4_port(ctx, false); } SEC("xdp/cpumap") int cpumap_redirect(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; swap_src_dst_mac(data); return bpf_redirect_map(&tx_port, 0, 0); } SEC("xdp/cpumap") int cpumap_pass(struct xdp_md *ctx) { return XDP_PASS; } SEC("xdp/cpumap") int cpumap_drop(struct xdp_md *ctx) { return XDP_DROP; } SEC("xdp/devmap") int redirect_egress_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN); return XDP_PASS; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-bench/xdp_redirect_cpumap.c000066400000000000000000000231571514310632100215110ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2017 
Jesper Dangaard Brouer, Red Hat, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp-bench.h" #include "xdp_sample.h" #include "xdp_redirect_cpumap.skel.h" static int map_fd; static int avail_fd; static int count_fd; static int mask = SAMPLE_RX_CNT | SAMPLE_CPUMAP_ENQUEUE_CNT | SAMPLE_CPUMAP_KTHREAD_CNT | SAMPLE_EXCEPTION_CNT; const struct cpumap_opts defaults_redirect_cpumap = { .mode = XDP_MODE_NATIVE, .interval = 2, .qsize = 2048, .program_mode = CPUMAP_CPU_L4_HASH, }; static const char *cpumap_prog_names[] = { "cpumap_no_touch", "cpumap_touch_data", "cpumap_round_robin", "cpumap_l4_proto", "cpumap_l4_filter", "cpumap_l4_hash", "cpumap_l4_sport", "cpumap_l4_dport", }; DEFINE_SAMPLE_INIT(xdp_redirect_cpumap); static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, __u32 avail_idx, bool new) { __u32 curr_cpus_count = 0; __u32 key = 0; int ret; /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. */ ret = bpf_map_update_elem(map_fd, &cpu, value, 0); if (ret < 0) { pr_warn("Create CPU entry failed: %s\n", strerror(errno)); return ret; } /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. */ ret = bpf_map_update_elem(avail_fd, &avail_idx, &cpu, 0); if (ret < 0) { pr_warn("Add to avail CPUs failed: %s\n", strerror(errno)); return ret; } /* When not replacing/updating existing entry, bump the count */ ret = bpf_map_lookup_elem(count_fd, &key, &curr_cpus_count); if (ret < 0) { pr_warn("Failed reading curr cpus_count: %s\n", strerror(errno)); return ret; } if (new) { curr_cpus_count++; ret = bpf_map_update_elem(count_fd, &key, &curr_cpus_count, 0); if (ret < 0) { pr_warn("Failed write curr cpus_count: %s\n", strerror(errno)); return ret; } } pr_debug("%s CPU: %u as idx: %u qsize: %d cpumap_prog_fd: %d (cpus_count: %u)\n", new ? 
"Add new" : "Replace", cpu, avail_idx, value->qsize, value->bpf_prog.fd, curr_cpus_count); return 0; } /* CPUs are zero-indexed. Thus, add a special sentinel default value * in map cpus_available to mark CPU index'es not configured */ static int mark_cpus_unavailable(void) { int ret, i, n_cpus = libbpf_num_possible_cpus(); __u32 invalid_cpu = n_cpus; for (i = 0; i < n_cpus; i++) { ret = bpf_map_update_elem(avail_fd, &i, &invalid_cpu, 0); if (ret < 0) { pr_warn("Failed marking CPU unavailable: %s\n", strerror(errno)); return ret; } } return 0; } /* Stress cpumap management code by concurrently changing underlying cpumap */ static void stress_cpumap(void *ctx) { struct bpf_cpumap_val *value = ctx; /* Changing qsize will cause kernel to free and alloc a new * bpf_cpu_map_entry, with an associated/complicated tear-down * procedure. */ value->qsize = 1024; create_cpu_entry(1, value, 0, false); value->qsize = 8; create_cpu_entry(1, value, 0, false); value->qsize = 16000; create_cpu_entry(1, value, 0, false); } static int set_cpumap_prog(struct xdp_redirect_cpumap *skel, enum cpumap_remote_action action, const struct iface *redir_iface) { struct bpf_devmap_val val = {}; __u32 key = 0; int err; switch (action) { case ACTION_DISABLED: return 0; case ACTION_DROP: return bpf_program__fd(skel->progs.cpumap_drop); case ACTION_PASS: return bpf_program__fd(skel->progs.cpumap_pass); case ACTION_REDIRECT: break; default: return -EINVAL; } if (!redir_iface->ifindex) { pr_warn("Must specify redirect device when using --remote-action 'redirect'\n"); return -EINVAL; } if (get_mac_addr(redir_iface->ifindex, skel->bss->tx_mac_addr) < 0) { pr_warn("Couldn't get MAC address for interface %s\n", redir_iface->ifname); return -EINVAL; } val.ifindex = redir_iface->ifindex; val.bpf_prog.fd = bpf_program__fd(skel->progs.redirect_egress_prog); err = bpf_map_update_elem(bpf_map__fd(skel->maps.tx_port), &key, &val, 0); if (err < 0) return -errno; return bpf_program__fd(skel->progs.cpumap_redirect); 
} int do_redirect_cpumap(const void *cfg, __unused const char *pin_root_path) { const struct cpumap_opts *opt = cfg; DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct xdp_program *xdp_prog = NULL; struct xdp_redirect_cpumap *skel; struct bpf_program *prog = NULL; struct bpf_map_info info = {}; struct bpf_cpumap_val value; __u32 infosz = sizeof(info); int ret = EXIT_FAIL_OPTION; int n_cpus, fd; size_t i; if (opt->extended) sample_switch_mode(); if (opt->redir_iface.ifindex) mask |= SAMPLE_DEVMAP_XMIT_CNT_MULTI; n_cpus = libbpf_num_possible_cpus(); /* Notice: Choosing the queue size is very important when CPU is * configured with power-saving states. * * If deepest state take 133 usec to wakeup from (133/10^6). When link * speed is 10Gbit/s ((10*10^9/8) in bytes/sec). How many bytes can * arrive with in 133 usec at this speed: (10*10^9/8)*(133/10^6) = * 166250 bytes. With MTU size packets this is 110 packets, and with * minimum Ethernet (MAC-preamble + intergap) 84 bytes is 1979 packets. * * Setting default cpumap queue to 2048 as worst-case (small packet) * should be +64 packet due kthread wakeup call (due to xdp_do_flush) * worst-case is 2043 packets. 
* * Sysadm can configured system to avoid deep-sleep via: * tuned-adm profile network-latency */ skel = xdp_redirect_cpumap__open(); if (!skel) { pr_warn("Failed to xdp_redirect_cpumap__open: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end; } /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL) if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); prog = bpf_object__find_program_by_name(skel->obj, cpumap_prog_names[opt->program_mode]); if (!prog) { pr_warn("Failed to find program '%s'\n", cpumap_prog_names[opt->program_mode]); goto end_destroy; } ret = sample_init_pre_load(skel, opt->iface_in.ifname); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } if (bpf_map__set_max_entries(skel->maps.cpu_map, n_cpus) < 0) { pr_warn("Failed to set max entries for cpu_map map: %s", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_destroy; } if (bpf_map__set_max_entries(skel->maps.cpus_available, n_cpus) < 0) { pr_warn("Failed to set max entries for cpus_available map: %s", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = EXIT_FAIL_OPTION; skel->rodata->from_match[0] = opt->iface_in.ifindex; if (opt->redir_iface.ifindex) skel->rodata->to_match[0] = opt->redir_iface.ifindex; opts.obj = skel->obj; opts.prog_name = bpf_program__name(prog); xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { ret = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-ret)); goto end_destroy; } /* We always set the frags support bit: nothing the program does is * incompatible with multibuf, and it's perfectly fine to load a program * with frags support on an interface with a small MTU. 
We don't risk * setting any flags the kernel will balk at, either, since libxdp will * do the feature probing for us and skip the flag if the kernel doesn't * support it. * * The function below returns EOPNOTSUPP it libbpf is too old to support * setting the flags, but we just ignore that, since in such a case the * best we can do is just attempt to run without the frags support. */ xdp_program__set_xdp_frags_support(xdp_prog, true); ret = xdp_program__attach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); if (ret < 0) { pr_warn("Failed to attach XDP program: %s\n", strerror(-ret)); goto end_destroy; } ret = bpf_obj_get_info_by_fd(bpf_map__fd(skel->maps.cpu_map), &info, &infosz); if (ret < 0) { pr_warn("Failed bpf_obj_get_info_by_fd for cpumap: %s\n", strerror(errno)); goto end_detach; } skel->bss->cpumap_map_id = info.id; map_fd = bpf_map__fd(skel->maps.cpu_map); avail_fd = bpf_map__fd(skel->maps.cpus_available); count_fd = bpf_map__fd(skel->maps.cpus_count); ret = mark_cpus_unavailable(); if (ret < 0) { pr_warn("Unable to mark CPUs as unavailable\n"); goto end_detach; } ret = sample_init(skel, mask, opt->iface_in.ifindex, 0); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } fd = set_cpumap_prog(skel, opt->remote_action, &opt->redir_iface); if (fd < 0) { ret = EXIT_FAIL_BPF; goto end_detach; } value.qsize = opt->qsize; value.bpf_prog.fd = fd; for (i = 0; i < opt->cpus.num_vals; i++) { if (create_cpu_entry(opt->cpus.vals[i], &value, i, true) < 0) { pr_warn("Cannot proceed, exiting\n"); ret = EXIT_FAIL; goto end_detach; } } ret = sample_run(opt->interval, opt->stress_mode ? 
stress_cpumap : NULL, &value); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = EXIT_OK; end_detach: xdp_program__detach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); end_destroy: xdp_program__close(xdp_prog); xdp_redirect_cpumap__destroy(skel); end: sample_teardown(); return ret; } xdp-tools-1.6.1/xdp-bench/xdp_redirect_devmap.bpf.c000066400000000000000000000047001514310632100222370ustar00rootroot00000000000000/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include #include #include #include #include /* The 2nd xdp prog on egress does not support skb mode, so we define two * maps, tx_port_general and tx_port_native. 
*/ struct { __uint(type, BPF_MAP_TYPE_DEVMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, 1); } tx_port_general SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_DEVMAP); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(struct bpf_devmap_val)); __uint(max_entries, 1); } tx_port_native SEC(".maps"); /* store egress interface mac address */ const volatile char tx_mac_addr[ETH_ALEN]; static __always_inline int xdp_redirect_devmap(struct xdp_md *ctx, void *redirect_map) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = bpf_get_smp_processor_id(); struct ethhdr *eth = data; struct datarec *rec; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); swap_src_dst_mac(data); return bpf_redirect_map(redirect_map, 0, 0); } SEC("xdp") int redir_devmap_general(struct xdp_md *ctx) { return xdp_redirect_devmap(ctx, &tx_port_general); } SEC("xdp") int redir_devmap_native(struct xdp_md *ctx) { return xdp_redirect_devmap(ctx, &tx_port_native); } SEC("xdp/devmap") int xdp_redirect_devmap_egress(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN); return XDP_PASS; } SEC("xdp/devmap") int xdp_redirect_devmap_egress_drop(struct xdp_md *ctx) { return XDP_DROP; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-bench/xdp_redirect_devmap.c000066400000000000000000000157001514310632100214730ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. 
http://covalent.io */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp-bench.h" #include "xdp_sample.h" #include "xdp_redirect_devmap.skel.h" static int mask = SAMPLE_RX_CNT | SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; DEFINE_SAMPLE_INIT(xdp_redirect_devmap); const struct devmap_opts defaults_redirect_devmap = { .mode = XDP_MODE_NATIVE, .interval = 2 }; static struct bpf_program *egress_prog(struct xdp_redirect_devmap *skel, enum devmap_egress_action action) { switch (action) { case DEVMAP_EGRESS_DROP: return skel->progs.xdp_redirect_devmap_egress_drop; case DEVMAP_EGRESS_NONE: case DEVMAP_EGRESS_FORWARD: default: return skel->progs.xdp_redirect_devmap_egress; } } int do_redirect_devmap(const void *cfg, __unused const char *pin_root_path) { const struct devmap_opts *opt = cfg; struct xdp_program *xdp_prog = NULL, *dummy_prog = NULL; const char *prog_name = "redir_devmap_native"; DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct bpf_devmap_val devmap_val = {}; struct bpf_map *tx_port_map = NULL; struct xdp_redirect_devmap *skel; struct bpf_program *prog = NULL; char str[2 * IF_NAMESIZE + 1]; int ret = EXIT_FAIL_OPTION; bool tried = false; int key = 0; if (opt->extended) sample_switch_mode(); if (opt->mode == XDP_MODE_SKB) /* devmap_xmit tracepoint not available */ mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI); if (opt->stats) mask |= SAMPLE_REDIRECT_CNT; if (!opt->load_egress && opt->egress_action != DEVMAP_EGRESS_NONE) { pr_warn("egress-action option was set without load-egress\n"); goto end; } restart: skel = xdp_redirect_devmap__open(); if (!skel) { pr_warn("Failed to xdp_redirect_devmap__open: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end; } /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL) if 
(bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); if (tried) { tx_port_map = skel->maps.tx_port_general; bpf_program__set_autoload(egress_prog(skel, opt->egress_action), false); #ifdef HAVE_LIBBPF_BPF_MAP__SET_AUTOCREATE bpf_map__set_autocreate(skel->maps.tx_port_native, false); #else pr_warn("Libbpf is missing bpf_map__set_autocreate(), fallback won't work\n"); ret = EXIT_FAIL_BPF; goto end_destroy; #endif } else { #ifdef HAVE_LIBBPF_BPF_MAP__SET_AUTOCREATE bpf_map__set_autocreate(skel->maps.tx_port_general, false); #endif tx_port_map = skel->maps.tx_port_native; } ret = sample_init_pre_load(skel, opt->iface_in.ifname); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } /* Load 2nd xdp prog on egress. */ if (opt->load_egress) { ret = get_mac_addr(opt->iface_out.ifindex, skel->rodata->tx_mac_addr); if (ret < 0) { pr_warn("Failed to get interface %s mac address: %s\n", opt->iface_out.ifname, strerror(-ret)); ret = EXIT_FAIL; goto end_destroy; } } skel->rodata->from_match[0] = opt->iface_in.ifindex; skel->rodata->to_match[0] = opt->iface_out.ifindex; opts.obj = skel->obj; opts.prog_name = prog_name; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { ret = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-ret)); goto end_destroy; } /* We always set the frags support bit: nothing the program does is * incompatible with multibuf, and it's perfectly fine to load a program * with frags support on an interface with a small MTU. We don't risk * setting any flags the kernel will balk at, either, since libxdp will * do the feature probing for us and skip the flag if the kernel doesn't * support it. 
* * The function below returns EOPNOTSUPP it libbpf is too old to support * setting the flags, but we just ignore that, since in such a case the * best we can do is just attempt to run without the frags support. */ xdp_program__set_xdp_frags_support(xdp_prog, true); ret = xdp_program__attach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); if (ret < 0) { /* First try with struct bpf_devmap_val as value for generic * mode, then fallback to sizeof(int) for older kernels. */ if (!opt->load_egress && !tried) { pr_warn("Attempting fallback to int-sized devmap\n"); prog_name = "redir_devmap_general"; tried = true; xdp_program__close(xdp_prog); xdp_redirect_devmap__destroy(skel); sample_teardown(); xdp_prog = NULL; goto restart; } pr_warn("Failed to attach XDP program: %s\n", strerror(-ret)); ret = EXIT_FAIL_XDP; goto end_destroy; } ret = sample_init(skel, mask, opt->iface_in.ifindex, opt->iface_out.ifindex); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } opts.obj = NULL; opts.prog_name = "xdp_pass"; opts.find_filename = "xdp-dispatcher.o"; dummy_prog = xdp_program__create(&opts); if (!dummy_prog) { pr_warn("Failed to load dummy program: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_detach; } xdp_program__set_xdp_frags_support(dummy_prog, true); ret = xdp_program__attach(dummy_prog, opt->iface_out.ifindex, opt->mode, 0); if (ret < 0) { pr_warn("Failed to attach dummy program: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_detach; } devmap_val.ifindex = opt->iface_out.ifindex; if (opt->load_egress) { struct bpf_program *prog = egress_prog(skel, opt->egress_action); devmap_val.bpf_prog.fd = bpf_program__fd(prog); } ret = bpf_map_update_elem(bpf_map__fd(tx_port_map), &key, &devmap_val, 0); if (ret < 0) { pr_warn("Failed to update devmap value: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_detach; } ret = EXIT_FAIL; safe_strncpy(str, get_driver_name(opt->iface_in.ifindex), sizeof(str)); 
pr_info("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", opt->iface_in.ifname, opt->iface_in.ifindex, str, opt->iface_out.ifname, opt->iface_out.ifindex, get_driver_name(opt->iface_out.ifindex)); ret = sample_run(opt->interval, NULL, NULL); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_destroy; } ret = EXIT_OK; end_detach: if (dummy_prog) xdp_program__detach(dummy_prog, opt->iface_out.ifindex, opt->mode, 0); xdp_program__detach(xdp_prog, opt->iface_in.ifindex, opt->mode, 0); end_destroy: xdp_program__close(xdp_prog); xdp_program__close(dummy_prog); xdp_redirect_devmap__destroy(skel); end: sample_teardown(); return ret; } xdp-tools-1.6.1/xdp-bench/xdp_redirect_devmap_multi.bpf.c000066400000000000000000000036001514310632100234470ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, 32); } forward_map_general SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(struct bpf_devmap_val)); __uint(max_entries, 32); } forward_map_native SEC(".maps"); /* map to store egress interfaces mac addresses */ struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, __u32); __type(value, __be64); __uint(max_entries, 32); } mac_map SEC(".maps"); static int xdp_redirect_devmap_multi(struct xdp_md *ctx, void *forward_map) { __u32 key = bpf_get_smp_processor_id(); struct datarec *rec; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_PASS; NO_TEAR_INC(rec->processed); return bpf_redirect_map(forward_map, 0, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); } SEC("xdp") int redir_multi_general(struct xdp_md *ctx) { return xdp_redirect_devmap_multi(ctx, &forward_map_general); } SEC("xdp") int redir_multi_native(struct xdp_md *ctx) { return 
xdp_redirect_devmap_multi(ctx, &forward_map_native); } SEC("xdp/devmap") int xdp_devmap_prog(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 key = ctx->egress_ifindex; struct ethhdr *eth = data; __be64 *mac; __u64 nh_off; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; mac = bpf_map_lookup_elem(&mac_map, &key); if (mac) __builtin_memcpy(eth->h_source, mac, ETH_ALEN); return XDP_PASS; } SEC("xdp/devmap") int xdp_redirect_devmap_egress_drop(struct xdp_md *ctx) { return XDP_DROP; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-bench/xdp_redirect_devmap_multi.c000066400000000000000000000146471514310632100227160ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp_sample.h" #include "xdp-bench.h" #include "xdp_redirect_devmap_multi.skel.h" static int ifaces[MAX_IFACE_NUM] = {}; static int mask = SAMPLE_RX_CNT | SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_SKIP_HEADING; DEFINE_SAMPLE_INIT(xdp_redirect_devmap_multi); static int update_mac_map(struct bpf_map *map) { int mac_map_fd = bpf_map__fd(map); unsigned char mac_addr[6]; unsigned int ifindex; int i, ret = -1; for (i = 0; ifaces[i] > 0; i++) { ifindex = ifaces[i]; ret = get_mac_addr(ifindex, mac_addr); if (ret < 0) { pr_warn("get interface %d mac failed\n", ifindex); return ret; } ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0); if (ret < 0) { pr_warn("Failed to update mac address for ifindex %d\n", ifindex); return ret; } } return 0; } const struct devmap_multi_opts defaults_redirect_devmap_multi = { .mode = XDP_MODE_NATIVE, .interval = 2 }; static struct bpf_program * egress_prog_multi(struct xdp_redirect_devmap_multi *skel, enum 
devmap_egress_action action) { switch (action) { case DEVMAP_EGRESS_DROP: return skel->progs.xdp_redirect_devmap_egress_drop; case DEVMAP_EGRESS_NONE: case DEVMAP_EGRESS_FORWARD: default: return skel->progs.xdp_devmap_prog; } } int do_redirect_devmap_multi(const void *cfg, __unused const char *pin_root_path) { const struct devmap_multi_opts *opt = cfg; const char *prog_name = "redir_multi_native"; DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct xdp_redirect_devmap_multi *skel; struct bpf_devmap_val devmap_val = {}; struct xdp_program *xdp_prog = NULL; struct bpf_map *forward_map = NULL; bool first = true, tried = false; struct bpf_program *prog = NULL; int ret = EXIT_FAIL_OPTION; struct iface *iface; int i; if (opt->extended) sample_switch_mode(); if (opt->mode == XDP_MODE_SKB) /* devmap_xmit tracepoint not available */ mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI); if (opt->stats) mask |= SAMPLE_REDIRECT_CNT; if (!opt->load_egress && opt->egress_action != DEVMAP_EGRESS_NONE) { pr_warn("egress-action option was set without load-egress\n"); goto end; } restart: skel = xdp_redirect_devmap_multi__open(); if (!skel) { pr_warn("Failed to xdp_redirect_devmap_multi__open: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end; } /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(skel->obj, prog)) != NULL) if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); if (tried) { forward_map = skel->maps.forward_map_general; bpf_program__set_autoload(egress_prog_multi(skel, opt->egress_action), false); #ifdef HAVE_LIBBPF_BPF_MAP__SET_AUTOCREATE bpf_map__set_autocreate(skel->maps.forward_map_native, false); #else pr_warn("Libbpf is missing bpf_map__set_autocreate(), fallback won't work\n"); ret = EXIT_FAIL_BPF; goto end_destroy; #endif } else { #ifdef HAVE_LIBBPF_BPF_MAP__SET_AUTOCREATE 
bpf_map__set_autocreate(skel->maps.forward_map_general, false); #endif forward_map = skel->maps.forward_map_native; } ret = sample_init_pre_load(skel, NULL); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = EXIT_FAIL_OPTION; /* opt parsing enforces num <= MAX_IFACES_NUM */ for (i = 0, iface = opt->ifaces; iface; i++, iface = iface->next) { skel->rodata->from_match[i] = iface->ifindex; skel->rodata->to_match[i] = iface->ifindex; } opts.obj = skel->obj; opts.prog_name = prog_name; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { ret = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-ret)); goto end_destroy; } for (iface = opt->ifaces; iface; iface = iface->next) { pr_debug("Loading program on interface %s\n", iface->ifname); ret = xdp_program__attach(xdp_prog, iface->ifindex, opt->mode, 0); if (ret) { if (first) { if (!opt->load_egress && !tried) { pr_warn("Attempting fallback to int-sized devmap\n"); prog_name = "redir_multi_general"; tried = true; xdp_program__close(xdp_prog); xdp_redirect_devmap_multi__destroy(skel); sample_teardown(); xdp_prog = NULL; goto restart; } pr_warn("Failed to attach XDP program to iface %s: %s\n", iface->ifname, strerror(-ret)); goto end_destroy; } pr_warn("Failed to attach XDP program to iface %s: %s\n", iface->ifname, strerror(-ret)); goto end_detach; } /* Add all the interfaces to forward group and attach * egress devmap program if exist */ devmap_val.ifindex = iface->ifindex; if (opt->load_egress) { struct bpf_program *prog = egress_prog_multi(skel, opt->egress_action); devmap_val.bpf_prog.fd = bpf_program__fd(prog); } ret = bpf_map_update_elem(bpf_map__fd(forward_map), &iface->ifindex, &devmap_val, 0); if (ret < 0) { pr_warn("Failed to update devmap value: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_detach; } first = false; } if (opt->load_egress) { /* Update mac_map with all egress interfaces' mac addr */ if 
(update_mac_map(skel->maps.mac_map) < 0) { pr_warn("Updating mac address failed\n"); ret = EXIT_FAIL; goto end_detach; } } ret = sample_init(skel, mask, 0, 0); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = sample_run(opt->interval, NULL, NULL); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_detach; } ret = EXIT_OK; end_detach: for (iface = opt->ifaces; iface; iface = iface->next) xdp_program__detach(xdp_prog, iface->ifindex, opt->mode, 0); end_destroy: xdp_program__close(xdp_prog); xdp_redirect_devmap_multi__destroy(skel); end: sample_teardown(); return ret; } xdp-tools-1.6.1/xdp-bench/xdp_socket.c000066400000000000000000000027601514310632100176300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "logging.h" #include "xdp-bench.h" #include "xdp_sample.h" const struct xsk_opts defaults_xsk = { .attach_mode = XDP_MODE_NATIVE, .interval = 2, .retries = 3, .frame_size = 4096, .batch_size = 64, .tx_pkt_size = 64, .sched_policy = XSK_SCHED_OTHER, .clock = XSK_CLOCK_MONOTONIC, }; static int do_xsk(const struct xsk_opts *opt, enum xsk_benchmark_type bench) { struct xsk_ctx *ctx; pthread_t pt; int ret; ret = xsk_validate_opts(opt); if (ret) return ret; ctx = xsk_ctx__create(opt, bench); ret = libxdp_get_error(ctx); if (ret) return ret; pr_info("%s packets on %s (ifindex %d; queue %d; driver %s) using AF_XDP sockets\n", bench == XSK_BENCH_RXDROP ? 
"Dropping" : "Hairpinning", opt->iface.ifname, opt->iface.ifindex, opt->queue_idx, get_driver_name(opt->iface.ifindex)); ret = xsk_start_bench(ctx, &pt); if (ret) goto out; ret = xsk_stats_poller(ctx); pthread_join(pt, NULL); out: xsk_ctx__destroy(ctx); return ret; } int do_xsk_drop(const void *cfg, __unused const char *pin_root_path) { const struct xsk_opts *opt = cfg; return do_xsk(opt, XSK_BENCH_RXDROP); } int do_xsk_tx(const void *cfg, __unused const char *pin_root_path) { const struct xsk_opts *opt = cfg; return do_xsk(opt, XSK_BENCH_L2FWD); } xdp-tools-1.6.1/xdp-dump/000077500000000000000000000000001514310632100152025ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-dump/.gitignore000066400000000000000000000000131514310632100171640ustar00rootroot00000000000000*~ xdpdump xdp-tools-1.6.1/xdp-dump/Makefile000066400000000000000000000011001514310632100166320ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) TOOL_NAME := xdpdump XDP_TARGETS := xdpdump_bpf xdpdump_xdp USER_TARGETS := xdpdump TEST_FILE := tests/test-xdpdump.sh # Disable warnings about VLAs not being at the end of a structure when building # with clang. The code is fine, but clang's complaint coupled with -Werror would # break the build. See https://github.com/xdp-project/xdp-tools/issues/304 CFLAGS += "-Wno-gnu-variable-sized-type-not-at-end" LIB_DIR = ../lib USER_LIBS = -lpcap MAN_PAGE := xdpdump.8 include $(LIB_DIR)/common.mk xdp-tools-1.6.1/xdp-dump/README.org000066400000000000000000000256271514310632100166640ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdpdump #+TITLE: xdpdump #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"a simple tcpdump like tool for capturing packets at the XDP layer" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. # . 
# The org-mode export doesn't support extended title lines, so manually copy # over the first line of the resulting .man file before exporting and # committing. * xdpdump - a simple tcpdump like tool for capturing packets at the XDP layer =xdpdump= is a simple XDP packet capture tool that tries to behave similar to =tcpdump=, however, it has no packet filter or decode capabilities. This can be used for debugging XDP programs that are already loaded on an interface. Packets can be dumped/inspected before on *entry* to XDP program, or after at *exit* from an XDP program. Furthermore, at *exit* the XDP action is also captured. This means that even packets that are dropped at the XDP layer can be captured via this tool. =xdpdump= works by attaching a bpf trace program to the XDP entry and/or exit function which stores the raw packet in a perf trace buffer. If no XDP program is loaded this approach can not be used and the tool will use a libpcap live-capture to be backward compatible. ** Running xdpdump The syntax for running =xdpdump= is: #+begin_src Usage: xdpdump [options] XDPDump tool to dump network traffic Options: --rx-capture Capture point for the rx direction (valid values: entry,exit) -D, --list-interfaces Print the list of available interfaces -i, --interface Name of interface to capture on --perf-wakeup Wake up xdpdump every packets -p, --program-names Specific program to attach to -s, --snapshot-length Minimum bytes of packet to capture --use-pcap Use legacy pcap format for XDP traces -w, --write Write raw packets to pcap file -x, --hex Print the full packet in hex -v, --verbose Enable verbose logging (-vv: more verbose) --version Display version information -h, --help Show this help #+end_src * The options explained The =xdpdump= tool tries to mimic the basic =tcpdump= options, but just in case below each of the available options is explained: ** --rx-capture Specify where the ingress packet gets captured. 
Either at the entry of the XDP program and/or exit of the XDP program. Valid options are *entry*, *exit*, or both *entry,exit*. The packet at *exit* can be modified by the XDP program. If you are interested to see both the original and modified packet, use the *entry,exit* option. With this, each packet is captured twice. The default value for this is *entry*. ** -D, --list-interfaces Display a list of available interfaces and any XDP program loaded ** --load-xdp-mode Specifies which loader mode to use with the =--load-xdp-program= option. The valid values are ‘native’, which is the default in-driver XDP mode, ‘skb’, which causes the so-called skb mode (also known as generic XDP) to be used, ‘hw’ which causes the program to be offloaded to the hardware, or ‘unspecified’ which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). Note that using ‘unspecified’ can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is ‘native’. ** --load-xdp-program If no XDP program is loaded on the interface, by default, xdpdump will fallback to libpcap's live capture mode to capture the packets. Alternatively, with this option, you can ask xdpdump to load an XDP program to capture the packets directly. ** -i, --interface Listen on interface =ifname=. Note that if no XDP program is loaded on the interface it will use libpcap's live capture mode to capture the packets. ** --perf-wakeup :feat_perfbuf: Let the Kernel wake up =xdpdump= once for every == being posted in the perf ring buffer. The higher the number the less the impact is on the actual XDP program. The default value is 0, which automatically calculates the value based on the available CPUs/buffers. Use -v to see the actual used value. ** -p, --program-names [|all] This option allows you to capture packets for a specific, set of, or all XDP programs loaded on the interface. 
You can either specify the actual program names or program IDs separated by commas. In the case where multiple programs are attached with the same name, you should use the program ID. Use the -D option to see the loaded programs and their IDs. In addition, the Linux API does not provide the full name of the attached eBPF entry function if it's longer than 15 characters. xdpdump will try to guess the correct function name from the available BTF debug information. However, if multiple functions exist with the same leading name, it can not pick the correct one. It will dump the available functions, and you can choose the correct one, and supply it with this option. If you have programs with duplicate long names, you also need to specify the program ID with the full name. This can be done by adding the id to the name with the =@= suffix. ** -P, --promiscuous-mode This option puts the interface into promiscuous mode. ** -s, --snapshot-length Capture *snaplen* bytes of a packet rather than the default 262144 bytes. ** --use-pcap Use legacy pcap format for XDP traces. By default, it will use the PcapNG format so that it can store various metadata. ** -w, --write Write the raw packets to a pcap file rather than printing them out hexadecimal. Standard output is used if *file* is =-=. ** -x, --hex When dumping packets on the console also print the full packet content in hex. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** --version Display =xpdump= version information and exit. 
** -h, --help Display a summary of the available options * Examples The below will load the =xdp-filter= program on eth0, but it does not do any actual filtering: #+begin_src # xdp-filter load --mode skb eth0 # # xdpdump -D Interface Prio Program name Mode ID Tag Chain actions -------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher skb 10651 d51e469e988d81da => 10 xdpfilt_alw_all 10669 0b394f43ab24501c XDP_PASS #+end_src Now we can try =xdpdump=: #+begin_src # xdpdump -i eth0 -x listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes 1584373839.460733895: xdp_dispatcher()@entry: packet size 102 bytes, captured 102 bytes on if_index 2, rx queue 0, id 1 0x0000: 52 54 00 db 44 b6 52 54 00 34 38 da 08 00 45 48 RT..D.RT.48...EH 0x0010: 00 58 d7 dd 40 00 40 06 ec c3 c0 a8 7a 01 c0 a8 .X..@.@.....z... 0x0020: 7a 64 9c de 00 16 0d d5 c6 bc 46 c9 bb 11 80 18 zd........F..... 0x0030: 01 f5 7b b4 00 00 01 01 08 0a 77 0a 8c b8 40 12 ..{.......w...@. 0x0040: cc a6 00 00 00 10 54 ce 6e 20 c3 e7 da 6c 08 42 ......T.n ...l.B 0x0050: d6 d9 ee 42 42 f0 82 c9 4f 12 ed 7b 19 ab 22 0d ...BB...O..{..". 0x0060: 09 29 a9 ee df 89 .).... 1584373839.462340808: xdp_dispatcher()@entry: packet size 66 bytes, captured 66 bytes on if_index 2, rx queue 0, id 2 0x0000: 52 54 00 db 44 b6 52 54 00 34 38 da 08 00 45 48 RT..D.RT.48...EH 0x0010: 00 34 d7 de 40 00 40 06 ec e6 c0 a8 7a 01 c0 a8 .4..@.@.....z... 0x0020: 7a 64 9c de 00 16 0d d5 c6 e0 46 c9 bc 85 80 10 zd........F..... 0x0030: 01 f5 74 0c 00 00 01 01 08 0a 77 0a 8c ba 40 12 ..t.......w...@. 
0x0040: d2 34 .4 ^C 2 packets captured 0 packets dropped by perf ring #+end_src Below are two more examples redirecting the capture file to =tcpdump= or =tshark=: #+begin_src # xdpdump -i eth0 -w - | tcpdump -r - -n listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes reading from file -, link-type EN10MB (Ethernet) 15:55:09.075887 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [P.], seq 3857553815:3857553851, ack 3306438882, win 501, options [nop,nop,TS val 1997449167 ecr 1075234328], length 36 15:55:09.077756 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [.], ack 37, win 501, options [nop,nop,TS val 1997449169 ecr 1075244363], length 0 15:55:09.750230 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [P.], seq 36:72, ack 37, win 501, options [nop,nop,TS val 1997449842 ecr 1075244363], length 36 #+end_src #+begin_src # xdpdump -i eth0 -w - | tshark -r - -n listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes 1 0.000000 192.168.122.1 → 192.168.122.100 SSH 102 Client: Encrypted packet (len=36) 2 0.000646 192.168.122.1 → 192.168.122.100 TCP 66 40158 → 22 [ACK] Seq=37 Ack=37 Win=1467 Len=0 TSval=1997621571 TSecr=1075416765 3 12.218164 192.168.122.1 → 192.168.122.100 SSH 102 Client: Encrypted packet (len=36) #+end_src One final example capturing specific XDP programs loaded on the interface: #+begin_src # xdpdump -D Interface Prio Program name Mode ID Tag Chain actions -------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher skb 10558 d51e469e988d81da => 5 xdp_test_prog_w 10576 b5a46c6e9935298c XDP_PASS => 10 xdp_pass 10582 3b185187f1855c4c XDP_PASS => 10 xdp_pass 10587 3b185187f1855c4c XDP_PASS #+end_src We would like to see the packets on the =xdp_dispatcher()= and the 2nd =xdp_pass()= program: #+begin_src # xdpdump -i eth0 --rx-capture=entry,exit -p xdp_dispatcher,xdp_pass@10587 
or # xdpdump -i eth0 --rx-capture=entry,exit -p 10558,10587 listening on eth0, ingress XDP program ID 10558 func xdp_dispatcher, ID 10587 func xdp_pass, capture mode entry/exit, capture size 262144 bytes 1607694215.501287259: xdp_dispatcher()@entry: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501371504: xdp_pass()@entry: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501383099: xdp_pass()@exit[PASS]: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501394709: xdp_dispatcher()@exit[PASS]: packet size 102 bytes on if_index 2, rx queue 0, id 1 ^C 4 packets captured 0 packets dropped by perf ring #+end_src * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR =xdpdump= was written by Eelco Chaudron xdp-tools-1.6.1/xdp-dump/tests/000077500000000000000000000000001514310632100163445ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-dump/tests/test-xdpdump.sh000066400000000000000000000674471514310632100213600ustar00rootroot00000000000000# # Test scrip to do basic xdpdump checks # # shellcheck disable=2039 # ALL_TESTS="test_help test_interfaces test_capt_pcap test_capt_pcapng test_capt_term test_exitentry test_snap test_multi_pkt test_perf_wakeup test_promiscuous_selfload test_promiscuous_preload test_none_xdp test_pname_parse test_multi_prog test_xdp_load" XDPDUMP=${XDPDUMP:-./xdpdump} XDP_LOADER=${XDP_LOADER:-../xdp-loader/xdp-loader} TEST_RETRIES=3 RESULT="" print_result() { if [ -n "$1" ]; then echo "ERROR: $1" echo "==== RESULT: ====" echo "$RESULT" echo "==== END ====" else echo "$RESULT" fi } test_help() { local XDPDUMP_HELP_TEXT XDPDUMP_HELP_TEXT=$(cat <<-END Usage: xdpdump [options] XDPDump tool to dump network traffic Options: --rx-capture Capture point for the rx direction (valid values: entry,exit) -D, --list-interfaces Print the list of available interfaces --load-xdp-mode Mode used for --load-xdp-mode, default native (valid values: native,skb,hw,unspecified) 
--load-xdp-program Load XDP trace program if no XDP program is loaded -i, --interface Name of interface to capture on --perf-wakeup Wake up xdpdump every packets -p, --program-names Specific program to attach to -P, --promiscuous-mode Open interface in promiscuous mode -s, --snapshot-length Minimum bytes of packet to capture --use-pcap Use legacy pcap format for XDP traces -w, --write Write raw packets to pcap file -x, --hex Print the full packet in hex -v, --verbose Enable verbose logging (-vv: more verbose) --version Display version information -h, --help Show this help END ) $XDPDUMP --help | grep -q -- "--perf-wakeup" if [ $? -eq 1 ]; then XDPDUMP_HELP_TEXT=$(echo "$XDPDUMP_HELP_TEXT" | sed '/--perf-wakeup /d') fi RESULT=$($XDPDUMP --help) if [ "$RESULT" != "$XDPDUMP_HELP_TEXT" ]; then print_result "The --help output failed" return 1 fi RESULT=$($XDPDUMP -h) if [ "$RESULT" != "$XDPDUMP_HELP_TEXT" ]; then print_result "The -h output failed" return 1 fi } test_interfaces() { local NO_PROG_REGEX="($NS +)" if is_multiprog_supported; then local PROG_REGEX="($NS[[:space:]]+xdp_dispatcher.+xdp_drop)" else local PROG_REGEX="($NS[[:space:]]+xdp_drop)" fi RESULT=$($XDPDUMP -D) if ! [[ $RESULT =~ $NO_PROG_REGEX ]]; then print_result "Failed showing test interface with no XPD program loaded" return 1 fi RESULT=$($XDPDUMP --list-interfaces) if ! [[ $RESULT =~ $NO_PROG_REGEX ]]; then print_result "Failed showing test interface with no XPD program loaded" return 1 fi $XDP_LOADER load "$NS" "$TEST_PROG_DIR/xdp_drop.o" RESULT=$($XDPDUMP -D) if ! 
[[ $RESULT =~ $PROG_REGEX ]]; then print_result "Failed showing test interface with XPD program loaded" return 1 fi $XDP_LOADER unload "$NS" --all } test_capt_pcap() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PASS_PKT="IP6 $INSIDE_IP6 > $OUTSIDE_IP6: ICMP6, echo reply(, id [0-9]+)?, seq 1, length 64" $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --use-pcap -w - | tcpdump -r - -n") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") $XDP_LOADER unload "$NS" --all || return 1 if ! [[ $RESULT =~ $PASS_PKT ]]; then print_result "IPv6 packet not received" return 1 fi } version_greater_or_equal() { printf '%s\n%s\n' "$2" "$1" | sort -V -C } test_capt_pcapng() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PCAP_FILE="/tmp/${NS}_PID_$$_$RANDOM.pcap" local PASS_PKT="IP6 $INSIDE_IP6 > $OUTSIDE_IP6: ICMP6, echo reply(, id [0-9]+)?, seq 1, length 64" local HW=$(uname -m | sed -e 's/[]\/$*+.^|[]/\\&/g') local OS=$(uname -snrv | sed -e 's/[]\/$+*.^()|[]/\\&/g') local INFOS_REGEX="" local OLD_CAPINFOS=0 local TSHARK_VERSION=$(tshark --version 2> /dev/null | sed -ne 's/^TShark (Wireshark) \([0-9]\+\.[0-9]\+\.[0-9]\+\).*/\1/p') if [[ "$(capinfos --help)" == *"Capinfos (Wireshark) 2."* ]]; then OLD_CAPINFOS=1 fi INFOS_REGEX+="(File type: Wireshark\/\.\.\. 
- pcapng.*" INFOS_REGEX+="Capture hardware: $HW.*" INFOS_REGEX+="Capture oper-sys: $OS.*" INFOS_REGEX+="Capture application: xdpdump v[0-9]+\.[0-9]+\.[0-9]+.*" INFOS_REGEX+="Capture comment: Capture was taken on interface $NS, with the following XDP programs loaded: xdp_dispatcher\(\) xdp_test_prog_w.*" INFOS_REGEX+="Interface #0 info:.*" INFOS_REGEX+="Name = ${NS}:xdp_test_prog_with_a_long_name\(\)@fentry.*" if [ $OLD_CAPINFOS -eq 0 ]; then INFOS_REGEX+="Hardware = driver: \"veth\", version: \"1\.0\", fw-version: \"\", rom-version: \"\", bus-info: \"\".*" fi INFOS_REGEX+="Time precision = nanoseconds \(9\).*" INFOS_REGEX+="Interface #1 info:.*" INFOS_REGEX+="Name = ${NS}:xdp_test_prog_with_a_long_name\(\)@fexit.*" if [ $OLD_CAPINFOS -eq 0 ]; then INFOS_REGEX+="Hardware = driver: \"veth\", version: \"1\.0\", fw-version: \"\", rom-version: \"\", bus-info: \"\".*" fi INFOS_REGEX+="Time precision = nanoseconds \(9\))" $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -w - | tcpdump -r - -n") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_PKT ]]; then print_result "IPv6 packet not received" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -w $PCAP_FILE --rx-capture=entry,exit") $PING6 -W 2 -c 1 "$INSIDE_IP6" || (rm "$PCAP_FILE" >& /dev/null; return 1) RESULT=$(stop_background "$PID") || (print_result "xdpdump failed"; rm "$PCAP_FILE" >& /dev/null; return 1) RESULT=$(capinfos "$PCAP_FILE") || (print_result "capinfos failed"; rm "$PCAP_FILE" >& /dev/null; return 1) if ! 
[[ $RESULT =~ $INFOS_REGEX ]]; then echo "REGEX: $INFOS_REGEX" print_result "Failed capinfos content" rm "$PCAP_FILE" >& /dev/null return 1 fi if version_greater_or_equal "$TSHARK_VERSION" 3.6.7; then local ATTRIB_REGEX="^$NS:xdp_test_prog_with_a_long_name\(\)@fentry 0 1 $.*^$NS:xdp_test_prog_with_a_long_name\(\)@fexit 0 1 2$.*" RESULT=$(tshark -r "$PCAP_FILE" -T fields \ -e frame.interface_name \ -e frame.interface_queue \ -e frame.packet_id \ -e frame.verdict.ebpf_xdp) if ! [[ $RESULT =~ $ATTRIB_REGEX ]]; then print_result "Failed attributes content with Tshark $TSHARK_VERSION" rm "$PCAP_FILE" >& /dev/null return 1 fi fi rm "$PCAP_FILE" >& /dev/null $XDP_LOADER unload "$NS" --all || return 1 } test_capt_term() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_X_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes, captured 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_X_OPT="0x0020: 00 00 00 00 00 02 fc 42 de ad ca fe" $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet not received" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -x") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_X_REGEX ]]; then print_result "IPv6 packet not received[2]" return 1 fi # If the IP6 addresses remain the same this simple string compare can be # used to verify the -x output is present. 
if [[ "$RESULT" != *"$PASS_X_OPT"* ]]; then print_result "IPv6 HEX packet not received" return 1 fi $XDP_LOADER unload "$NS" --all || return 1 } test_exitentry() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PASS_ENTRY_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_EXIT_REGEX="(xdp_test_prog_with_a_long_name\(\)@exit\[PASS\]: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_ENTRY_D_REGEX="(xdp_drop\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_EXIT_D_REGEX="(xdp_drop\(\)@exit\[DROP\]: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local ID_ENTRY_REGEX="xdp_drop\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id ([0-9]+)" local ID_EXIT_REGEX="xdp_drop\(\)@exit\[DROP\]: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id ([0-9]+)" $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --rx-capture=entry") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_ENTRY_REGEX ]]; then print_result "IPv6 entry packet not received" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --rx-capture=exit") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_EXIT_REGEX ]]; then print_result "IPv6 exit packet not received" return 1 fi $XDP_LOADER unload "$NS" --all || return 1 $XDP_LOADER load "$NS" "$TEST_PROG_DIR/xdp_drop.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS --rx-capture=exit") $PING6 -W 0.1 -c 1 "$INSIDE_IP6" # Note that this ping will fail!! RESULT=$(stop_background "$PID") if ! 
[[ $RESULT =~ $PASS_EXIT_D_REGEX ]]; then print_result "IPv6 drop exit packet not received" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS --rx-capture=exit,entry") $PING6 -W 0.1 -c 1 "$INSIDE_IP6" # Note that this ping will fail!! RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_EXIT_D_REGEX && $RESULT =~ $PASS_ENTRY_D_REGEX ]]; then print_result "IPv6 drop entry/exit packet not received" return 1 fi [[ $RESULT =~ $ID_ENTRY_REGEX ]] ENTRY_ID=${BASH_REMATCH[1]} [[ $RESULT =~ $ID_EXIT_REGEX ]] EXIT_ID=${BASH_REMATCH[1]} if [[ "$EXIT_ID" != "$ENTRY_ID" ]]; then print_result "Failed matching IDs" return 1 fi $XDP_LOADER unload "$NS" --all || return 1 } test_snap() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes, captured 16 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PASS_II_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes, captured 21 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -x --snapshot-length=16") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet fragment not received" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -x -s 21") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! 
[[ $RESULT =~ $PASS_II_REGEX ]]; then print_result "IPv6 packet fragment not received[2]" return 1 fi $XDP_LOADER unload "$NS" --all || return 1 } test_multi_pkt() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach local PASS_ENTRY_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size [0-9]+ bytes on if_index [0-9]+, rx queue [0-9]+, id 20000)" local PASS_EXIT_REGEX="(xdp_test_prog_with_a_long_name\(\)@exit\[PASS\]: packet size [0-9]+ bytes on if_index [0-9]+, rx queue [0-9]+, id 20000)" local PKT_SIZES=(56 512 1500) $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 for PKT_SIZE in "${PKT_SIZES[@]}" ; do PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --rx-capture=entry,exit") timeout 40 $PING6 -q -W 0.1 -s "$PKT_SIZE" -c 20000 -f "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_ENTRY_REGEX ]]; then print_result "IPv6 entry packet not received, $PKT_SIZE" return 1 fi if ! [[ $RESULT =~ $PASS_EXIT_REGEX ]]; then print_result "IPv6 exit packet not received, $PKT_SIZE" return 1 fi done $XDP_LOADER unload "$NS" --all || return 1 } test_perf_wakeup() { skip_if_missing_kernel_symbol bpf_xdp_output_proto skip_if_missing_trace_attach $XDPDUMP --help | grep -q -- "--perf-wakeup" if [ $? -eq 1 ]; then # No support for perf_wakeup, so return SKIP return "$SKIPPED_TEST" fi local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+)" local PASS_10K_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id 10000)" local WAKEUPS=(0 1 32 128) $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 for WAKEUP in "${WAKEUPS[@]}" ; do # We send a single packet to make sure flushing of the buffer works! 
PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --perf-wakeup=$WAKEUP") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet not received for wakeup $WAKEUP" return 1 fi # We sent 10k packets and see if the all arrive PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name --perf-wakeup=$WAKEUP") timeout 20 "$PING6" -q -W 2 -c 10000 -f "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_10K_REGEX ]]; then print_result "IPv6 10k packet not received for wakeup $WAKEUP" return 1 fi done $XDP_LOADER unload "$NS" --all || return 1 } test_none_xdp() { local PASS_PKT="packet size 118 bytes on if_name \"$NS\"" local WARN_MSG="WARNING: Specified interface does not have an XDP program loaded," $XDP_LOADER unload "$NS" --all PID=$(start_tcpdump "$XDPDUMP -i $NS") $PING6 -i 0.1 -W 2 -c 4 "$INSIDE_IP6" || return 1 sleep 1 RESULT=$(stop_background "$PID") if [[ "$RESULT" != *"$PASS_PKT"* ]]; then print_result "IPv6 packet not received" return 1 fi if [[ "$RESULT" != *"$WARN_MSG"* ]]; then print_result "Missing warning message" return 1 fi } test_promiscuous_selfload() { local PASS_PKT="packet size 118 bytes on if_name \"$NS\"" local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes, captured 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" $XDP_LOADER unload "$NS" --all dmesg -C PID=$(start_tcpdump "$XDPDUMP -i $NS -P") $PING6 -i 0.1 -W 2 -c 4 "$INSIDE_IP6" || return 1 sleep 1 RESULT=$(stop_background "$PID") if [[ "$RESULT" != *"$PASS_PKT"* ]]; then print_result "IPv6 packet not received [legacy mode]" return 1 fi RESULT=$(dmesg) if [[ "$RESULT" != *"device $NS entered promiscuous mode"* ]] && [[ "$RESULT" != *"$NS: entered promiscuous mode"* ]]; then print_result "Failed enabling promiscuous mode on legacy interface" return 1 fi if [[ "$RESULT" != *"device $NS left promiscuous mode"* 
]] && [[ "$RESULT" != *"$NS: left promiscuous mode"* ]]; then print_result "Failed disabling promiscuous mode on legacy interface" return 1 fi } test_promiscuous_preload() { skip_if_missing_kernel_symbol bpf_xdp_output skip_if_missing_trace_attach local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes, captured 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" || return 1 dmesg -C PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name -x --promiscuous-mode") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet not received" return 1 fi RESULT=$(dmesg) if [[ "$RESULT" != *"device $NS entered promiscuous mode"* ]] && [[ "$RESULT" != *"$NS: entered promiscuous mode"* ]]; then print_result "Failed enabling promiscuous mode on interface" return 1 fi if [[ "$RESULT" != *"device $NS left promiscuous mode"* ]] && [[ "$RESULT" != *"$NS: left promiscuous mode"* ]]; then print_result "Failed disabling promiscuous mode on interface" return 1 fi } test_pname_parse() { skip_if_legacy_fallback local PASS_REGEX="(xdp_test_prog_with_a_long_name\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PROG_ID_1=0 local PROG_ID_2=0 local PROG_ID_3=0 local PROG_ID_4=0 $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 # Here we load the programs without the xdp-tools loader to make sure # they are not loaded as a multi-program. $TEST_PROG_DIR/test-tool load -m skb "$NS" "$TEST_PROG_DIR/test_long_func_name.o" # We need to specify the function name or else it should fail PID=$(start_background "$XDPDUMP -i $NS") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't identify the full XDP main function!"* ]]; then print_result "xdpdump should fail with duplicate function!" 
return 1 fi # Here we specify the correct function name so we should get the packet PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet not received" return 1 fi # Here we specify the wrong correct function name so we should not get the packet PID=$(start_background "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name_too") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't load eBPF object:"* ]]; then print_result "xdpdump should fail being unable to attach!" return 1 fi # Here we specify an non-existing function PID=$(start_background "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_non_existing_name") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't find function 'xdp_test_prog_with_a_long_non_existing_name' on interface!"* ]]; then print_result "xdpdump should fail with unknown function!" return 1 fi # Verify invalid program indexes PID=$(start_background "$XDPDUMP -i $NS -p hallo@3e") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't extract valid program id from \"hallo@3e\"!"* ]]; then print_result "xdpdump should fail with id value error!" return 1 fi PID=$(start_background "$XDPDUMP -i $NS -p hallo@128") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Invalid program id supplied, \"hallo@128\"!"* ]]; then print_result "xdpdump should fail with invalid id!" 
return 1 fi # Remove loaded program ip link set dev "$NS" xdpgeneric off # Now test actual multi-program parsing (negative test cases) $XDP_LOADER unload "$NS" --all $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" "$TEST_PROG_DIR/xdp_pass.o" "$TEST_PROG_DIR/xdp_drop.o" PID=$(start_background "$XDPDUMP -D") RESULT=$(stop_background "$PID") PROG_ID_1=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 1p | tr -d ' ') PROG_ID_2=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 2p | tr -d ' ') PROG_ID_3=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 3p | tr -d ' ') PROG_ID_4=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 4p | tr -d ' ') PID=$(start_background "$XDPDUMP -i $NS -p all") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't identify the full XDP 'xdp_test_prog_w' function in program $PROG_ID_2!"* || $RESULT != *"xdp_test_prog_with_a_long_name@$PROG_ID_2"* || $RESULT != *"xdp_test_prog_with_a_long_name_too@$PROG_ID_2"* || $RESULT != *"Command line to replace 'all':"* || $RESULT != *"xdp_dispatcher@$PROG_ID_1,@$PROG_ID_2,xdp_pass@$PROG_ID_3,xdp_drop@$PROG_ID_4"* ]]; then print_result "xdpdump should fail with all list!" return 1 fi PID=$(start_background "$XDPDUMP -i $NS -p hallo@$PROG_ID_1") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't find function 'hallo' in interface program $PROG_ID_1!"* ]]; then print_result "xdpdump should fail with hallo not found on program $PROG_ID_1!" return 1 fi PID=$(start_background "$XDPDUMP -i $NS -p hallo") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't find function 'hallo' on interface"* ]]; then print_result "xdpdump should fail hallo not found!" 
return 1 fi PID=$(start_background "$XDPDUMP -i $NS -p xdp_test_prog_w") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't identify the full XDP 'xdp_test_prog_w' function!"* || $RESULT != *"xdp_test_prog_with_a_long_name_too"* ]]; then print_result "xdpdump should fail can't id xdp_test_prog_w!" return 1 fi PID=$(start_background "$XDPDUMP -i $NS -p xdp_test_prog_w@$PROG_ID_2") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: Can't identify the full XDP 'xdp_test_prog_w' function in program $PROG_ID_2!"* || $RESULT != *"xdp_test_prog_with_a_long_name_too@$PROG_ID_2"* ]]; then print_result "xdpdump should fail can't id xdp_test_prog_w@$PROG_ID_2!" return 1 fi # Now load XDP programs with duplicate functions $XDP_LOADER unload "$NS" --all $XDP_LOADER load "$NS" "$TEST_PROG_DIR/test_long_func_name.o" "$TEST_PROG_DIR/test_long_func_name.o" "$TEST_PROG_DIR/xdp_pass.o" "$TEST_PROG_DIR/xdp_drop.o" PID=$(start_background "$XDPDUMP -D") RESULT=$(stop_background "$PID") PROG_ID_1=$(echo "$RESULT" | grep "$NS" -A2 | awk '{print $4}' | sed -n 1p | tr -d ' ') PROG_ID_2=$(echo "$RESULT" | grep "$NS" -A2 | awk '{print $4}' | sed -n 2p | tr -d ' ') PROG_ID_3=$(echo "$RESULT" | grep "$NS" -A2 | awk '{print $4}' | sed -n 2p | tr -d ' ') PID=$(start_background "$XDPDUMP -i $NS -p xdp_test_prog_with_a_long_name") RESULT=$(stop_background "$PID") if [[ $RESULT != *"ERROR: The function 'xdp_test_prog_with_a_long_name' exists in multiple programs!"* || $RESULT != *"xdp_test_prog_with_a_long_name@$PROG_ID_2"* || $RESULT != *"xdp_test_prog_with_a_long_name@$PROG_ID_3"* ]]; then print_result "xdpdump should fail with duplicate function!" 
return 1 fi $XDP_LOADER unload "$NS" --all return 0 } test_multi_prog() { skip_if_legacy_fallback skip_if_missing_trace_attach local ENTRY_REGEX="(xdp_dispatcher\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+).*(xdp_pass\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local EXIT_REGEX="(xdp_pass\(\)@exit\[PASS\]: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+).*(xdp_dispatcher\(\)@exit\[PASS\]: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local PROG_ID_1=0 local PROG_ID_4=0 $XDP_LOADER load "$NS" "$TEST_PROG_DIR/xdp_pass.o" "$TEST_PROG_DIR/test_long_func_name.o" "$TEST_PROG_DIR/xdp_pass.o" PID=$(start_background "$XDPDUMP -D") RESULT=$(stop_background "$PID") PROG_ID_1=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 1p | tr -d ' ') PROG_ID_4=$(echo "$RESULT" | grep "$NS" -A4 | awk '{print $4}' | sed -n 4p | tr -d ' ') PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_dispatcher,xdp_pass@$PROG_ID_4 -vv") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if [[ $RESULT == *"Unrecognized arg#0 type PTR"* ]]; then $XDP_LOADER unload "$NS" --all return $SKIPPED_TEST fi if ! [[ $RESULT =~ $ENTRY_REGEX ]]; then print_result "Not received all fentry packets" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_dispatcher,xdp_pass@$PROG_ID_4 --rx-capture=exit") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $EXIT_REGEX ]]; then print_result "Not received all fexit packets" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_dispatcher,xdp_pass@$PROG_ID_4 --rx-capture=exit,entry") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $ENTRY_REGEX ]]; then print_result "Not received all fentry packets on entry/exit test" return 1 fi if ! 
[[ $RESULT =~ $EXIT_REGEX ]]; then print_result "Not received all fexit packets on entry/exit test" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p $PROG_ID_1,$PROG_ID_4 --rx-capture=exit,entry") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $ENTRY_REGEX ]]; then print_result "[IDs]Not received all fentry packets on entry/exit test" return 1 fi if ! [[ $RESULT =~ $EXIT_REGEX ]]; then print_result "[IDs]Not received all fexit packets on entry/exit test" return 1 fi PID=$(start_tcpdump "$XDPDUMP -i $NS -p xdp_dispatcher,$PROG_ID_4 --rx-capture=exit,entry") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if ! [[ $RESULT =~ $ENTRY_REGEX ]]; then print_result "[Mix]Not received all fentry packets on entry/exit test" return 1 fi if ! [[ $RESULT =~ $EXIT_REGEX ]]; then print_result "[Mix]Not received all fexit packets on entry/exit test" return 1 fi $XDP_LOADER unload "$NS" --all return 0 } test_xdp_load() { local PASS_REGEX="(xdpdump\(\)@entry: packet size 118 bytes on if_index [0-9]+, rx queue [0-9]+, id [0-9]+)" local WARN_MSG="Will load a capture only XDP program!" PID=$(start_tcpdump "$XDPDUMP -i $NS --load-xdp-program") $PING6 -W 2 -c 1 "$INSIDE_IP6" || return 1 RESULT=$(stop_background "$PID") if [[ "$RESULT" != *"$WARN_MSG"* ]]; then print_result "Missing warning message" return 1 fi if ! 
[[ $RESULT =~ $PASS_REGEX ]]; then print_result "IPv6 packet not received" return 1 fi } cleanup_tests() { $XDP_LOADER unload "$NS" --all >/dev/null 2>&1 } xdp-tools-1.6.1/xdp-dump/xdpdump.8000066400000000000000000000253311514310632100167600ustar00rootroot00000000000000.TH "xdpdump" "8" "JANUARY 13, 2021" "V1.6.1" "a simple tcpdump like tool for capturing packets at the XDP layer" .SH "NAME" xdpdump \- a simple tcpdump like tool for capturing packets at the XDP layer .SH "SYNOPSIS" .PP \fIxdpdump\fP is a simple XDP packet capture tool that tries to behave similar to \fItcpdump\fP, however, it has no packet filter or decode capabilities. .PP This can be used for debugging XDP programs that are already loaded on an interface. Packets can be dumped/inspected before on \fBentry\fP to XDP program, or after at \fBexit\fP from an XDP program. Furthermore, at \fBexit\fP the XDP action is also captured. This means that even packets that are dropped at the XDP layer can be captured via this tool. .PP \fIxdpdump\fP works by attaching a bpf trace program to the XDP entry and/or exit function which stores the raw packet in a perf trace buffer. If no XDP program is loaded this approach can not be used and the tool will use a libpcap live-capture to be backward compatible. 
.SS "Running xdpdump" .PP The syntax for running \fIxdpdump\fP is: .RS .nf \fCUsage: xdpdump [options] XDPDump tool to dump network traffic Options: --rx-capture Capture point for the rx direction (valid values: entry,exit) -D, --list-interfaces Print the list of available interfaces -i, --interface Name of interface to capture on --perf-wakeup Wake up xdpdump every packets -p, --program-names Specific program to attach to -s, --snapshot-length Minimum bytes of packet to capture --use-pcap Use legacy pcap format for XDP traces -w, --write Write raw packets to pcap file -x, --hex Print the full packet in hex -v, --verbose Enable verbose logging (-vv: more verbose) --version Display version information -h, --help Show this help \fP .fi .RE .SH "The options explained" .PP The \fIxdpdump\fP tool tries to mimic the basic \fItcpdump\fP options, but just in case below each of the available options is explained: .SS "--rx-capture " .PP Specify where the ingress packet gets captured. Either at the entry of the XDP program and/or exit of the XDP program. Valid options are \fBentry\fP, \fBexit\fP, or both \fBentry,exit\fP. The packet at \fBexit\fP can be modified by the XDP program. If you are interested to see both the original and modified packet, use the \fBentry,exit\fP option. With this, each packet is captured twice. The default value for this is \fBentry\fP. .SS "-D, --list-interfaces" .PP Display a list of available interfaces and any XDP program loaded .SS "--load-xdp-mode" .PP Specifies which loader mode to use with the \fI\-\-load\-xdp\-program\fP option. The valid values are ‘native’, which is the default in-driver XDP mode, ‘skb’, which causes the so-called skb mode (also known as generic XDP) to be used, ‘hw’ which causes the program to be offloaded to the hardware, or ‘unspecified’ which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). 
Note that using ‘unspecified’ can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is ‘native’. .SS "--load-xdp-program" .PP If no XDP program is loaded on the interface, by default, xdpdump will fallback to libpcap's live capture mode to capture the packets. Alternatively, with this option, you can ask xdpdump to load an XDP program to capture the packets directly. .SS "-i, --interface " .PP Listen on interface \fIifname\fP. Note that if no XDP program is loaded on the interface it will use libpcap's live capture mode to capture the packets. .SS "--perf-wakeup " .PP Let the Kernel wake up \fIxdpdump\fP once for every \fI\fP being posted in the perf ring buffer. The higher the number the less the impact is on the actual XDP program. The default value is 0, which automatically calculates the value based on the available CPUs/buffers. Use -v to see the actual used value. .SS "-p, --program-names [|all]" .PP This option allows you to capture packets for a specific, set of, or all XDP programs loaded on the interface. You can either specify the actual program names or program IDs separated by commas. In the case where multiple programs are attached with the same name, you should use the program ID. Use the -D option to see the loaded programs and their IDs. .PP In addition, the Linux API does not provide the full name of the attached eBPF entry function if it's longer than 15 characters. xdpdump will try to guess the correct function name from the available BTF debug information. However, if multiple functions exist with the same leading name, it can not pick the correct one. It will dump the available functions, and you can choose the correct one, and supply it with this option. If you have programs with duplicate long names, you also need to specify the program ID with the full name. This can be done by adding the id to the name with the \fI@\fP suffix. 
.SS "-P, --promiscuous-mode" .PP This option puts the interface into promiscuous mode. .SS "-s, --snapshot-length " .PP Capture \fBsnaplen\fP bytes of a packet rather than the default 262144 bytes. .SS "--use-pcap" .PP Use legacy pcap format for XDP traces. By default, it will use the PcapNG format so that it can store various metadata. .SS "-w, --write " .PP Write the raw packets to a pcap file rather than printing them out hexadecimal. Standard output is used if \fBfile\fP is \fI\-\fP. .SS "-x, --hex" .PP When dumping packets on the console also print the full packet content in hex. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "--version" .PP Display \fIxpdump\fP version information and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "Examples" .PP The below will load the \fIxdp\-filter\fP program on eth0, but it does not do any actual filtering: .RS .nf \fC# xdp-filter load --mode skb eth0 # # xdpdump -D Interface Prio Program name Mode ID Tag Chain actions -------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher skb 10651 d51e469e988d81da => 10 xdpfilt_alw_all 10669 0b394f43ab24501c XDP_PASS \fP .fi .RE .PP Now we can try \fIxdpdump\fP: .RS .nf \fC# xdpdump -i eth0 -x listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes 1584373839.460733895: xdp_dispatcher()@entry: packet size 102 bytes, captured 102 bytes on if_index 2, rx queue 0, id 1 0x0000: 52 54 00 db 44 b6 52 54 00 34 38 da 08 00 45 48 RT..D.RT.48...EH 0x0010: 00 58 d7 dd 40 00 40 06 ec c3 c0 a8 7a 01 c0 a8 .X..@.@.....z... 0x0020: 7a 64 9c de 00 16 0d d5 c6 bc 46 c9 bb 11 80 18 zd........F..... 0x0030: 01 f5 7b b4 00 00 01 01 08 0a 77 0a 8c b8 40 12 ..{.......w...@. 0x0040: cc a6 00 00 00 10 54 ce 6e 20 c3 e7 da 6c 08 42 ......T.n ...l.B 0x0050: d6 d9 ee 42 42 f0 82 c9 4f 12 ed 7b 19 ab 22 0d ...BB...O..{..". 
0x0060: 09 29 a9 ee df 89 .).... 1584373839.462340808: xdp_dispatcher()@entry: packet size 66 bytes, captured 66 bytes on if_index 2, rx queue 0, id 2 0x0000: 52 54 00 db 44 b6 52 54 00 34 38 da 08 00 45 48 RT..D.RT.48...EH 0x0010: 00 34 d7 de 40 00 40 06 ec e6 c0 a8 7a 01 c0 a8 .4..@.@.....z... 0x0020: 7a 64 9c de 00 16 0d d5 c6 e0 46 c9 bc 85 80 10 zd........F..... 0x0030: 01 f5 74 0c 00 00 01 01 08 0a 77 0a 8c ba 40 12 ..t.......w...@. 0x0040: d2 34 .4 ^C 2 packets captured 0 packets dropped by perf ring \fP .fi .RE .PP Below are two more examples redirecting the capture file to \fItcpdump\fP or \fItshark\fP: .RS .nf \fC# xdpdump -i eth0 -w - | tcpdump -r - -n listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes reading from file -, link-type EN10MB (Ethernet) 15:55:09.075887 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [P.], seq 3857553815:3857553851, ack 3306438882, win 501, options [nop,nop,TS val 1997449167 ecr 1075234328], length 36 15:55:09.077756 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [.], ack 37, win 501, options [nop,nop,TS val 1997449169 ecr 1075244363], length 0 15:55:09.750230 IP 192.168.122.1.40928 > 192.168.122.100.ssh: Flags [P.], seq 36:72, ack 37, win 501, options [nop,nop,TS val 1997449842 ecr 1075244363], length 36 \fP .fi .RE .RS .nf \fC# xdpdump -i eth0 -w - | tshark -r - -n listening on eth0, ingress XDP program ID 10651 func xdp_dispatcher, capture mode entry, capture size 262144 bytes 1 0.000000 192.168.122.1 → 192.168.122.100 SSH 102 Client: Encrypted packet (len=36) 2 0.000646 192.168.122.1 → 192.168.122.100 TCP 66 40158 → 22 [ACK] Seq=37 Ack=37 Win=1467 Len=0 TSval=1997621571 TSecr=1075416765 3 12.218164 192.168.122.1 → 192.168.122.100 SSH 102 Client: Encrypted packet (len=36) \fP .fi .RE .PP One final example capturing specific XDP programs loaded on the interface: .RS .nf \fC# xdpdump -D Interface Prio Program name Mode ID Tag Chain actions 
-------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher skb 10558 d51e469e988d81da => 5 xdp_test_prog_w 10576 b5a46c6e9935298c XDP_PASS => 10 xdp_pass 10582 3b185187f1855c4c XDP_PASS => 10 xdp_pass 10587 3b185187f1855c4c XDP_PASS \fP .fi .RE .PP We would like to see the packets on the \fIxdp_dispatcher()\fP and the 2nd \fIxdp_pass()\fP program: .RS .nf \fC# xdpdump -i eth0 --rx-capture=entry,exit -p xdp_dispatcher,xdp_pass@10587 or # xdpdump -i eth0 --rx-capture=entry,exit -p 10558,10587 listening on eth0, ingress XDP program ID 10558 func xdp_dispatcher, ID 10587 func xdp_pass, capture mode entry/exit, capture size 262144 bytes 1607694215.501287259: xdp_dispatcher()@entry: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501371504: xdp_pass()@entry: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501383099: xdp_pass()@exit[PASS]: packet size 102 bytes on if_index 2, rx queue 0, id 1 1607694215.501394709: xdp_dispatcher()@exit[PASS]: packet size 102 bytes on if_index 2, rx queue 0, id 1 ^C 4 packets captured 0 packets dropped by perf ring \fP .fi .RE .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP \fIxdpdump\fP was written by Eelco Chaudron xdp-tools-1.6.1/xdp-dump/xdpdump.c000066400000000000000000001560531514310632100170410ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /***************************************************************************** * Include files *****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PCAP_DONT_INCLUDE_PCAP_BPF_H #include #include #include #include #include #include #include #include "logging.h" #include "params.h" #include "util.h" #include "xdpdump.h" 
#include "xpcapng.h" #include "compat.h" /***************************************************************************** * Local definitions and global variables *****************************************************************************/ #define PROG_NAME "xdpdump" #define DEFAULT_SNAP_LEN 262144 #ifndef ENOTSUPP #define ENOTSUPP 524 /* Operation is not supported */ #endif #define RX_FLAG_FENTRY (1<<0) #define RX_FLAG_FEXIT (1<<1) struct flag_val rx_capture_flags[] = { {"entry", RX_FLAG_FENTRY}, {"exit", RX_FLAG_FEXIT}, {} }; struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {"unspecified", XDP_MODE_UNSPEC}, {NULL, 0} }; static const struct dumpopt { bool hex_dump; bool list_interfaces; bool load_xdp; bool promiscuous; bool use_pcap; struct iface iface; uint32_t perf_wakeup; uint32_t snaplen; char *pcap_file; char *program_names; unsigned int load_xdp_mode; unsigned int rx_capture; } defaults_dumpopt = { .hex_dump = false, .list_interfaces = false, .load_xdp = false, .promiscuous = false, .use_pcap = false, .snaplen = DEFAULT_SNAP_LEN, .load_xdp_mode = XDP_MODE_NATIVE, .rx_capture = RX_FLAG_FENTRY, }; struct dumpopt cfg_dumpopt; static struct prog_option xdpdump_options[] = { DEFINE_OPTION("rx-capture", OPT_FLAGS, struct dumpopt, rx_capture, .metavar = "", .typearg = rx_capture_flags, .help = "Capture point for the rx direction"), DEFINE_OPTION("list-interfaces", OPT_BOOL, struct dumpopt, list_interfaces, .short_opt = 'D', .help = "Print the list of available interfaces"), DEFINE_OPTION("load-xdp-mode", OPT_ENUM, struct dumpopt, load_xdp_mode, .typearg = xdp_modes, .metavar = "", .help = "Mode used for --load-xdp-mode, default native"), DEFINE_OPTION("load-xdp-program", OPT_BOOL, struct dumpopt, load_xdp, .help = "Load XDP trace program if no XDP program is loaded"), DEFINE_OPTION("interface", OPT_IFNAME, struct dumpopt, iface, .short_opt = 'i', .metavar = "", .help = "Name of interface to capture on"), 
#ifdef HAVE_LIBBPF_PERF_BUFFER__CONSUME DEFINE_OPTION("perf-wakeup", OPT_U32, struct dumpopt, perf_wakeup, .metavar = "", .help = "Wake up xdpdump every packets"), #endif DEFINE_OPTION("program-names", OPT_STRING, struct dumpopt, program_names, .short_opt = 'p', .metavar = "", .help = "Specific program to attach to"), DEFINE_OPTION("promiscuous-mode", OPT_BOOL, struct dumpopt, promiscuous, .short_opt = 'P', .help = "Open interface in promiscuous mode"), DEFINE_OPTION("snapshot-length", OPT_U32, struct dumpopt, snaplen, .short_opt = 's', .metavar = "", .help = "Minimum bytes of packet to capture"), DEFINE_OPTION("use-pcap", OPT_BOOL, struct dumpopt, use_pcap, .help = "Use legacy pcap format for XDP traces"), DEFINE_OPTION("write", OPT_STRING, struct dumpopt, pcap_file, .short_opt = 'w', .metavar = "", .help = "Write raw packets to pcap file"), DEFINE_OPTION("hex", OPT_BOOL, struct dumpopt, hex_dump, .short_opt = 'x', .help = "Print the full packet in hex"), END_OPTIONS }; #define MAX_LOADED_XDP_PROGRAMS (MAX_DISPATCHER_ACTIONS + 1) struct capture_programs { /* Contains a list of programs to capture on, with the respective * program names. The order MUST be the same as the loaded order! */ unsigned int nr_of_progs; struct prog_info { struct xdp_program *prog; const char *func; unsigned int rx_capture; /* Fields used by the actual loader. 
*/ bool attached; int perf_map_fd; struct bpf_object *prog_obj; struct bpf_link *fentry_link; struct bpf_link *fexit_link; } progs[MAX_LOADED_XDP_PROGRAMS]; }; struct perf_handler_ctx { uint64_t missed_events; uint64_t last_missed_events; uint64_t captured_packets; uint64_t epoch_delta; uint64_t packet_id; uint64_t cpu_packet_id[MAX_CPUS]; struct dumpopt *cfg; struct capture_programs *xdp_progs; pcap_t *pcap; pcap_dumper_t *pcap_dumper; struct xpcapng_dumper *pcapng_dumper; }; bool exit_xdpdump; pcap_t *exit_pcap; /***************************************************************************** * get_if_speed() *****************************************************************************/ static uint64_t get_if_speed(struct iface *iface) { #define MAX_MODE_MASKS 10 int fd; struct ifreq ifr; struct { struct ethtool_link_settings req; uint32_t modes[3 * MAX_MODE_MASKS]; } ereq; if (iface == NULL) return 0; /* Open socket, and initialize structures. */ fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return 0; memset(&ereq, 0, sizeof(ereq)); ereq.req.cmd = ETHTOOL_GLINKSETTINGS; memset(&ifr, 0, sizeof(ifr)); strncpy(ifr.ifr_name, iface->ifname, sizeof(ifr.ifr_name) - 1); ifr.ifr_data = (void *)&ereq; /* First query the kernel to see how many masks we need to ask for. */ if (ioctl(fd, SIOCETHTOOL, &ifr) != 0) goto error_exit; if (ereq.req.link_mode_masks_nwords >= 0 || ereq.req.link_mode_masks_nwords < -MAX_MODE_MASKS || ereq.req.cmd != ETHTOOL_GLINKSETTINGS) goto error_exit; /* Now ask for the data set, and extract the speed in bps. */ ereq.req.link_mode_masks_nwords = -ereq.req.link_mode_masks_nwords; if (ioctl(fd, SIOCETHTOOL, &ifr) != 0) goto error_exit; /* If speed is unknown return 0. 
*/ if (ereq.req.speed == -1U) ereq.req.speed = 0; close(fd); return ereq.req.speed * 1000000ULL; error_exit: close(fd); return 0; } /***************************************************************************** * get_if_drv_info() *****************************************************************************/ static char *get_if_drv_info(struct iface *iface, char *buffer, size_t len) { int fd; char *r_buffer = NULL; struct ifreq ifr; struct ethtool_drvinfo info; if (iface == NULL || buffer == NULL || len == 0) return NULL; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return NULL; memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; memset(&ifr, 0, sizeof(ifr)); strncpy(ifr.ifr_name, iface->ifname, sizeof(ifr.ifr_name) - 1); ifr.ifr_data = (void *)&info; if (ioctl(fd, SIOCETHTOOL, &ifr) != 0) goto exit; if (try_snprintf(buffer, len, "driver: \"%s\", version: \"%s\", " "fw-version: \"%s\", rom-version: \"%s\", " "bus-info: \"%s\"", info.driver, info.version, info.fw_version, info.erom_version, info.bus_info)) goto exit; r_buffer = buffer; exit: close(fd); return r_buffer; } /***************************************************************************** * set_if_promiscuous_mode() *****************************************************************************/ static int set_if_promiscuous_mode(struct iface *iface, bool enable, bool *did_enable) { int fd; int rc = 0; struct ifreq ifr; if (iface == NULL) return -EINVAL; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) return -errno; memset(&ifr, 0, sizeof(ifr)); strncpy(ifr.ifr_name, iface->ifname, sizeof(ifr.ifr_name) - 1); if (ioctl(fd, SIOCGIFFLAGS, &ifr) != 0) { pr_debug("DBG: Failed getting promiscuous mode: %s\n", strerror(errno)); rc = -errno; goto exit; } if (((ifr.ifr_flags & IFF_PROMISC) && enable) || (!(ifr.ifr_flags & IFF_PROMISC) && !enable)) { pr_debug("DBG: Promiscuous mode already %s!\n", enable ? 
"on" : "off"); goto exit; } if (enable) ifr.ifr_flags |= IFF_PROMISC; else ifr.ifr_flags &= ~IFF_PROMISC; if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) { pr_debug("DBG: Failed setting promiscuous mode %s: %s\n", enable ? "on" : "off", strerror(errno)); rc = -errno; goto exit; } if (did_enable) { if (enable) *did_enable = true; else *did_enable = false; } exit: close(fd); return rc; } /***************************************************************************** * get_xdp_return_string() *****************************************************************************/ static const char *get_xdp_action_string(enum xdp_action act) { switch (act) { case XDP_ABORTED: return "[ABORTED]"; case XDP_DROP: return "[DROP]"; case XDP_PASS: return "[PASS]"; case XDP_TX: return "[TX]"; case XDP_REDIRECT: return "[REDIRECT]"; } return "[*unknown*]"; } /***************************************************************************** * get_capture_mode_string() *****************************************************************************/ static const char *get_capture_mode_string(unsigned int mode) { switch(mode) { case RX_FLAG_FENTRY: return "entry"; case RX_FLAG_FEXIT: return "exit"; case RX_FLAG_FENTRY | RX_FLAG_FEXIT: return "entry/exit"; } return "unknown"; } /***************************************************************************** * snprinth() *****************************************************************************/ #define SNPRINTH_MIN_BUFFER_SIZE sizeof("0xffff: 00 11 22 33 44 55 66 77 88" \ " 99 aa bb cc dd ee ff " \ "................0") static int snprinth(char *str, size_t size, const uint8_t *buffer, size_t buffer_size, size_t offset) { int i; int pre_skip; int post_skip; size_t zero_offset; if (str == NULL || size < SNPRINTH_MIN_BUFFER_SIZE || buffer == NULL || offset >= buffer_size || buffer_size > 0xffff) return -EINVAL; zero_offset = offset & ~0xf; pre_skip = offset & 0xf; post_skip = (zero_offset + 0xf) < buffer_size ? 
\ 0 : 16 - (buffer_size - zero_offset); /* Print offset */ snprintf(str, size, "0x%04zx: ", offset & 0xfff0); str += 9; /* Print hex values */ if (pre_skip) { memset(str, ' ', pre_skip * 3); str[pre_skip * 3] = 0; } for (i = pre_skip; i < 16 - post_skip; i++) { snprintf(str + (i * 3), 5, "%02x ", buffer[zero_offset + i]); } if (post_skip) { memset(str + (i * 3), ' ', post_skip * 3); str[(i * 3) + (post_skip * 3)] = 0; } /* Print printable chars */ str += 16 * 3; *str++ = ' '; if (pre_skip) { memset(str, ' ', pre_skip); str[pre_skip] = 0; } for (i = pre_skip; i < 16 - post_skip; i++) str[i] = isprint(buffer[zero_offset + i]) ? \ buffer[zero_offset + i]: '.'; str[i] = 0; return 0; } /***************************************************************************** * handle_perf_event() *****************************************************************************/ static enum bpf_perf_event_ret handle_perf_event(void *private_data, int cpu, struct perf_event_header *event) { uint64_t ts; bool fexit; unsigned int if_idx, prog_idx; const char *xdp_func; struct perf_handler_ctx *ctx = private_data; struct perf_sample_event *e = container_of(event, struct perf_sample_event, header); struct perf_lost_event *lost = container_of(event, struct perf_lost_event, header); switch(e->header.type) { case PERF_RECORD_SAMPLE: if (cpu >= MAX_CPUS || e->header.size < sizeof(struct perf_sample_event) || e->size < (sizeof(struct pkt_trace_metadata) + e->metadata.cap_len) || e->metadata.prog_index >= ctx->xdp_progs->nr_of_progs) return LIBBPF_PERF_EVENT_CONT; fexit = e->metadata.flags & MDF_DIRECTION_FEXIT; prog_idx = e->metadata.prog_index; if_idx = prog_idx * 2 + (fexit ? 
1 : 0); xdp_func = ctx->xdp_progs->progs[prog_idx].func; if (prog_idx == 0 && (!fexit || ctx->xdp_progs->progs[prog_idx].rx_capture == RX_FLAG_FEXIT)) ctx->cpu_packet_id[cpu] = ++ctx->packet_id; ts = e->time + ctx->epoch_delta; if (ctx->pcapng_dumper) { struct xpcapng_epb_options_s options = {}; int64_t action = e->metadata.action; uint32_t queue = e->metadata.rx_queue; options.flags = PCAPNG_EPB_FLAG_INBOUND; options.dropcount = ctx->last_missed_events; options.packetid = &ctx->cpu_packet_id[cpu]; options.queue = &queue; options.xdp_verdict = fexit ? &action : NULL; xpcapng_dump_enhanced_pkt(ctx->pcapng_dumper, if_idx, e->packet, e->metadata.pkt_len, min(e->metadata.cap_len, ctx->cfg->snaplen), ts, &options); ctx->last_missed_events = 0; if (ctx->cfg->pcap_file[0] == '-' && ctx->cfg->pcap_file[1] == 0) xpcapng_dump_flush(ctx->pcapng_dumper); } else if (ctx->pcap_dumper) { struct pcap_pkthdr h; h.ts.tv_sec = ts / 1000000000ULL; h.ts.tv_usec = ts % 1000000000ULL / 1000; h.caplen = min(e->metadata.cap_len, ctx->cfg->snaplen); h.len = e->metadata.pkt_len; pcap_dump((u_char *) ctx->pcap_dumper, &h, e->packet); if (ctx->cfg->pcap_file[0] == '-' && ctx->cfg->pcap_file[1] == 0) pcap_dump_flush(ctx->pcap_dumper); } else { int i; char hline[SNPRINTH_MIN_BUFFER_SIZE]; if (ctx->cfg->hex_dump) { printf("%llu.%09lld: %s()@%s%s: packet size %u " "bytes, captured %u bytes on if_index " "%u, rx queue %u, id %"PRIu64"\n", ts / 1000000000ULL, ts % 1000000000ULL, xdp_func, fexit ? "exit" : "entry", fexit ? get_xdp_action_string( e->metadata.action) : "", e->metadata.pkt_len, e->metadata.cap_len, e->metadata.ifindex, e->metadata.rx_queue, ctx->cpu_packet_id[cpu]); for (i = 0; i < e->metadata.cap_len; i += 16) { snprinth(hline, sizeof(hline), e->packet, e->metadata.cap_len, i); printf(" %s\n", hline); } } else { printf("%llu.%09lld: %s()@%s%s: packet size %u " "bytes on if_index %u, rx queue %u, " "id %"PRIu64"\n", ts / 1000000000ULL, ts % 1000000000ULL, xdp_func, fexit ? 
"exit" : "entry", fexit ? get_xdp_action_string( e->metadata.action) : "", e->metadata.pkt_len,e->metadata.ifindex, e->metadata.rx_queue, ctx->cpu_packet_id[cpu]); } } ctx->captured_packets++; break; case PERF_RECORD_LOST: ctx->missed_events += lost->lost; ctx->last_missed_events += lost->lost; break; } return LIBBPF_PERF_EVENT_CONT; } /***************************************************************************** * get_epoch_to_uptime_delta() *****************************************************************************/ static int get_epoch_to_uptime_delta(uint64_t *delta) { /* This function will calculate the rough delta between uptime * seconds and the epoch time. This is not a precise delta as there is * a delay between calling the two functions below (and time() being in * seconds), but it's good enough to get a general offset. The delta * between packets is still based on the timestamps from the trace * infrastructure. */ struct timespec ts; uint64_t uptime; uint64_t epoch = time(NULL) * 1000000000ULL; if (clock_gettime(CLOCK_MONOTONIC, &ts)) { pr_warn("ERROR: Failed to get CLOCK_MONOTONIC time: %s(%d)", strerror(errno), errno); return -errno; } uptime = ts.tv_sec * 1000000000ULL + ts.tv_nsec; *delta = epoch - uptime; return 0; } /***************************************************************************** * capture_on_legacy_interface() *****************************************************************************/ static bool capture_on_legacy_interface(struct dumpopt *cfg) { bool rc = false; char errbuf[PCAP_ERRBUF_SIZE]; uint64_t captured_packets = 0; pcap_t *pcap = NULL; pcap_dumper_t *pcap_dumper = NULL; struct pcap_stat ps; /* Open pcap handle for live capture. 
*/ if (cfg->rx_capture != RX_FLAG_FENTRY) { pr_warn("ERROR: For legacy capture only \"--rx-capture entry\"" " is supported!\n"); goto error_exit; } pcap = pcap_open_live(cfg->iface.ifname, cfg->snaplen, cfg->promiscuous, 1000, errbuf); if (pcap == NULL) { pr_warn("ERROR: Can't open pcap live interface: %s\n", errbuf); goto error_exit; } /* Open the pcap handle for pcap file. */ if (cfg->pcap_file) { pcap_dumper = pcap_dump_open(pcap, cfg->pcap_file); if (!pcap_dumper) { pr_warn("ERROR: Can't open pcap file for writing!\n"); goto error_exit; } } /* No more error conditions, display some capture information */ fprintf(stderr, "listening on %s, link-type %s (%s), " "capture size %d bytes\n", cfg->iface.ifname, pcap_datalink_val_to_name(pcap_datalink(pcap)), pcap_datalink_val_to_description(pcap_datalink(pcap)), cfg->snaplen); /* Loop for receive packets on live interface. */ exit_pcap = pcap; while (!exit_xdpdump) { const uint8_t *packet; struct pcap_pkthdr h; packet = pcap_next(pcap, &h); if (!packet) continue; if (pcap_dumper) { pcap_dump((u_char *) pcap_dumper, &h, packet); if (cfg->pcap_file[0] == '-' && cfg->pcap_file[1] == 0) pcap_dump_flush(pcap_dumper); } else { size_t i; char hline[SNPRINTH_MIN_BUFFER_SIZE]; if (cfg->hex_dump) { printf("%ld.%06ld: packet size %u bytes, " "captured %u bytes on if_name \"%s\"\n", (long) h.ts.tv_sec, (long) h.ts.tv_usec, h.len, h.caplen, cfg->iface.ifname); for (i = 0; i < h.caplen; i += 16) { snprinth(hline, sizeof(hline), packet, h.caplen, i); printf(" %s\n", hline); } } else { printf("%ld.%06ld: packet size %u bytes on " "if_name \"%s\"\n", (long) h.ts.tv_sec, (long) h.ts.tv_usec, h.len, cfg->iface.ifname); } } captured_packets++; } exit_pcap = NULL; rc = true; fflush(stdout); fprintf(stderr, "\n%"PRIu64" packets captured\n", captured_packets); if (pcap_stats(pcap, &ps) == 0) { fprintf(stderr, "%u packets dropped by kernel\n", ps.ps_drop); if (ps.ps_ifdrop != 0) fprintf(stderr, "%u packets dropped by interface\n", 
ps.ps_ifdrop); } error_exit: if (pcap_dumper) pcap_dump_close(pcap_dumper); if (pcap) pcap_close(pcap); return rc; } /***************************************************************************** * append_snprintf() *****************************************************************************/ int append_snprintf(char **buf, size_t *buf_len, size_t *offset, const char *format, ...) { int len; va_list args; if (buf == NULL || *buf == NULL || buf_len == NULL || *buf_len <= 0 || offset == NULL || *buf_len - *offset <= 0) return -EINVAL; while (true) { char *new_buf; size_t new_buf_len; va_start(args, format); len = vsnprintf(*buf + *offset, *buf_len - *offset, format, args); va_end(args); if ((size_t)len < (*buf_len - *offset)) { *offset += len; len = 0; break; } if (*buf_len >= 2048) return -ENOMEM; new_buf_len = *buf_len * 2; new_buf = realloc(*buf, new_buf_len); if (!new_buf) return -ENOMEM; *buf = new_buf; *buf_len = new_buf_len; } return len; } /***************************************************************************** * get_program_names_all() *****************************************************************************/ static char *get_program_names_all(struct capture_programs *progs, int skip_index) { char *program_names; size_t size = 128; size_t offset = 0; program_names = malloc(size); if (!program_names) return NULL; for (unsigned int i = 0; i < progs->nr_of_progs; i++) { const char *kname = xdp_program__name(progs->progs[i].prog); const char *fname = progs->progs[i].func; uint32_t id = xdp_program__id(progs->progs[i].prog); if (skip_index != (int)i) { if (append_snprintf(&program_names, &size, &offset, "%s%s@%d", i == 0 ? "" : ",", fname ? fname : kname, id) < 0) { free(program_names); return NULL; } } else { if (append_snprintf(&program_names, &size, &offset, "%s%s@%d", i == 0 ? 
"" : ",", "", id) < 0) { free(program_names); return NULL; } } } return program_names; } /***************************************************************************** * find_func_matches() *****************************************************************************/ static size_t find_func_matches(const struct btf *btf, const char *func_name, const char **found_name, bool print, int print_id, bool exact) { const struct btf_type *t, *match; size_t len, matches = 0; const char *name; int nr_types, i; if (!btf) { pr_debug("No BTF found for program\n"); return 0; } len = strlen(func_name); nr_types = btf__type_cnt(btf); for (i = 1; i < nr_types; i++) { t = btf__type_by_id(btf, i); if (!btf_is_func(t)) continue; name = btf__name_by_offset(btf, t->name_off); if (!strncmp(name, func_name, len)) { pr_debug("Found func %s matching %s\n", name, func_name); if (print) { if (print_id < 0) pr_warn(" %s\n", name); else pr_warn(" %s@%d\n", name, print_id); } /* Do an exact match if the user specified a function * name, or if there is no possibility of truncation * because the length is different from the truncated * length. 
*/ if (strlen(name) == len && (exact || len != BPF_OBJ_NAME_LEN - 1)) { *found_name = name; return 1; /* exact match */ } /* prefix, may not be unique */ matches++; match = t; } } if (exact) return 0; if (matches == 1) *found_name = btf__name_by_offset(btf, match->name_off); return matches; } /***************************************************************************** * match_target_function() *****************************************************************************/ static int match_target_function(struct dumpopt *cfg, struct capture_programs *all_progs, char *prog_name, int prog_id) { int i; unsigned int matches = 0; for (i = 0; i < (int)all_progs->nr_of_progs; i++) { const char *kname = xdp_program__name(all_progs->progs[i].prog); if (prog_id != -1 && xdp_program__id(all_progs->progs[i].prog) != (uint32_t) prog_id) continue; if (!strncmp(kname, prog_name, strlen(kname))) { if (all_progs->progs[i].func == NULL) { if (find_func_matches(xdp_program__btf(all_progs->progs[i].prog), prog_name, &all_progs->progs[i].func, false, -1, true) == 1) { all_progs->progs[i].rx_capture = cfg->rx_capture; matches++; } else if (strlen(prog_name) <= BPF_OBJ_NAME_LEN - 1) { /* If the user cut and paste the * truncated function name, make sure * we tell him all the possible options! 
*/ matches = UINT_MAX; break; } } else if (!strcmp(all_progs->progs[i].func, prog_name)) { all_progs->progs[i].rx_capture = cfg->rx_capture; matches++; } } if (prog_id != -1) break; } if (!matches) { if (prog_id == -1) pr_warn("ERROR: Can't find function '%s' on interface!\n", prog_name); else pr_warn("ERROR: Can't find function '%s' in interface program %d!\n", prog_name, prog_id); return -ENOENT; } else if (matches == 1) { return 0; } if (matches != UINT_MAX) { pr_warn("ERROR: The function '%s' exists in multiple programs!\n", prog_name); } else { if (prog_id == -1) pr_warn("ERROR: Can't identify the full XDP '%s' function!\n", prog_name); else pr_warn("ERROR: Can't identify the full XDP '%s' function in program %d!\n", prog_name, prog_id); } pr_warn("The following is a list of candidates:\n"); for (i = 0; i < (int)all_progs->nr_of_progs; i++) { uint32_t cur_prog_id = xdp_program__id(all_progs->progs[i].prog); const char *func_dummy; if (prog_id != -1 && cur_prog_id != (uint32_t) prog_id) continue; find_func_matches(xdp_program__btf(all_progs->progs[i].prog), xdp_program__name(all_progs->progs[i].prog), &func_dummy, true, (prog_id == -1 && matches == UINT_MAX) ? 
-1 : (int) cur_prog_id, false); if (prog_id != -1) break; } pr_warn("Please use the -p option to pick the correct one.\n"); if (!strcmp("all", cfg->program_names)) { char *program_names = get_program_names_all(all_progs, i); if (program_names) { pr_warn("Command line to replace 'all':\n %s\n", program_names); free(program_names); } } return -EAGAIN; } /***************************************************************************** * check_btf() *****************************************************************************/ static bool check_btf(struct xdp_program *prog) { if (xdp_program__btf(prog)) return true; pr_warn("ERROR: xdpdump requires BTF information, but that is missing " "from the loaded XDP program!\n"); return false; } /***************************************************************************** * find_target() * * What is this function trying to do? It will return a list of programs to * capture on, based on the configured program-names. If this parameter is * not given, it will attach to the first (main) program. * * Note that the kernel API will truncate function names at BPF_OBJ_NAME_LEN * so we need to guess the correct function if not explicitly given with * the program-names option. * *****************************************************************************/ static int find_target(struct dumpopt *cfg, struct xdp_multiprog *mp, struct capture_programs *tgt_progs) { const char *func; struct xdp_program *prog, *p; struct capture_programs progs; size_t matches; char *prog_name; char *prog_safe_ptr; char *program_names = cfg->program_names; prog = xdp_multiprog__main_prog(mp); if (!check_btf(prog)) return -EINVAL; /* First take care of the default case, i.e. no function supplied */ if (!program_names) { /* The libxdp code optimization where it skips the dispatcher * if only one program is loaded. If this is the case, we need * to attach to the actual first program, not the dispatcher. 
*/ if (xdp_multiprog__program_count(mp) == 1) { prog = xdp_multiprog__next_prog(NULL, mp); if (!check_btf(prog)) return -EINVAL; } matches = find_func_matches(xdp_program__btf(prog), xdp_program__name(prog), &func, false, -1, false); if (!matches) { pr_warn("ERROR: Can't find function '%s' on interface!\n", xdp_program__name(prog)); return -ENOENT; } else if (matches == 1) { tgt_progs->nr_of_progs = 1; tgt_progs->progs[0].prog = prog; tgt_progs->progs[0].func = func; tgt_progs->progs[0].rx_capture = cfg->rx_capture; return 0; } pr_warn("ERROR: Can't identify the full XDP main function!\n" "The following is a list of candidates:\n"); find_func_matches(xdp_program__btf(prog), xdp_program__name(prog), &func, true, -1, false); pr_warn("Please use the -p option to pick the correct one.\n"); return -EAGAIN; } /* We end up here if we have a configured function(s), which can be * any function in one of the programs attached. In the case of * multiple programs we can even have duplicate functions amongst * programs and we need a way to differentiate. We do this by * supplying the @. See the -D output for the program IDs. * We also have the "all" keyword, which will specify that all * functions need to be traced. 
*/ /* Fill in the all_prog data structure to make matching easier */ memset(&progs, 0, sizeof(progs)); progs.progs[progs.nr_of_progs].prog = prog; matches = find_func_matches(xdp_program__btf(prog), xdp_program__name(prog), &progs.progs[progs.nr_of_progs].func, false, -1, false); if (matches != 1) progs.progs[progs.nr_of_progs].func = NULL; progs.nr_of_progs++; for (p = xdp_multiprog__next_prog(NULL, mp); p; p = xdp_multiprog__next_prog(p, mp)) { progs.progs[progs.nr_of_progs].prog = p; matches = find_func_matches(xdp_program__btf(p), xdp_program__name(p), &progs.progs[progs.nr_of_progs].func, false, -1, false); if (matches != 1) progs.progs[progs.nr_of_progs].func = NULL; progs.nr_of_progs++; if (progs.nr_of_progs >= MAX_LOADED_XDP_PROGRAMS) break; } /* If "all" option is specified create temp program names */ if (!strcmp("all", program_names)) { program_names = get_program_names_all(&progs, -1); if (!program_names) { pr_warn("ERROR: Out of memory for 'all' programs!\n"); return -ENOMEM; } } /* Split up the --program-names and walk over it */ for (prog_name = strtok_r(program_names, ",", &prog_safe_ptr); prog_name != NULL; prog_name = strtok_r(NULL, ",", &prog_safe_ptr)) { int rc; unsigned long id = -1; char *id_str = strchr(prog_name, '@'); char *alloc_name = NULL; if (id_str) { unsigned int i; char *endptr; errno = 0; id_str++; id = strtoul(id_str, &endptr, 10); if ((errno == ERANGE && id == ULONG_MAX) || (errno != 0 && id == 0) || *endptr != '\0' || endptr == id_str) { pr_warn("ERROR: Can't extract valid program id from \"%s\"!\n", prog_name); if (cfg->program_names != program_names) free(program_names); return -EINVAL; } for (i = 0; i < progs.nr_of_progs; i++) { if (id == xdp_program__id(progs.progs[i].prog)) break; } if (i >= progs.nr_of_progs) { pr_warn("ERROR: Invalid program id supplied, \"%s\"!\n", prog_name); if (cfg->program_names != program_names) free(program_names); return -EINVAL; } alloc_name = strndup(prog_name, id_str - prog_name - 1); if 
(!alloc_name) { pr_warn("ERROR: Out of memory while processing program-name argument!\n"); if (cfg->program_names != program_names) free(program_names); return -ENOMEM; } prog_name = alloc_name; } else { /* If no @id was specified, verify if the program name * was not a program_id. If so, locate the name and * use it in the lookup below. */ char *endptr; unsigned long prog_id; prog_id = strtoul(prog_name, &endptr, 10); if (!((errno == ERANGE && prog_id == ULONG_MAX) || (errno != 0 && prog_id == 0) || *endptr != '\0' || endptr == prog_name)) { for (unsigned int i = 0; i < progs.nr_of_progs; i++) { if (prog_id == xdp_program__id(progs.progs[i].prog)) { alloc_name = strdup(progs.progs[i].func); if (alloc_name) { id = prog_id; prog_name = alloc_name; } break; } } } } rc = match_target_function(cfg, &progs, prog_name, id); free(alloc_name); if (rc < 0) { if (cfg->program_names != program_names) free(program_names); return rc; } } #if 0 /* Removed this optimization for now as it will save one packet when * three programs are loaded, two for four, etc. In addition, it will * make the packet flow looks a bit weird, without it's more clear * which programs the dispatcher has executed. */ if (cfg->rx_capture == (RX_FLAG_FENTRY | RX_FLAG_FEXIT)) { /* If we do entry and exit captures we can remove fentry from * back to back programs to skip storing an identical packet. * We keep fexit due to the reported return code. * * First program is the dispatches (which should not modify * the packet, but we can't be sure). So we skip this and the * first sub-programs fexit). 
*/ for (int i = 2; i < progs.nr_of_progs; i++) if (progs.progs[i-1].rx_capture & RX_FLAG_FENTRY) progs.progs[i].rx_capture &= ~RX_FLAG_FENTRY; } #endif if (cfg->program_names != program_names) free(program_names); /* Copy all the programs that need capture actions */ memset(tgt_progs, 0, sizeof(*tgt_progs)); for (unsigned int i = 0; i < progs.nr_of_progs; i++) { if (!progs.progs[i].rx_capture) continue; tgt_progs->progs[tgt_progs->nr_of_progs].prog = progs.progs[i].prog; tgt_progs->progs[tgt_progs->nr_of_progs].func = progs.progs[i].func; tgt_progs->progs[tgt_progs->nr_of_progs].rx_capture = progs.progs[i].rx_capture; tgt_progs->nr_of_progs++; } return 0; } /***************************************************************************** * get_loaded_program_info() *****************************************************************************/ static char *get_loaded_program_info(struct dumpopt *cfg) { char *info; size_t info_size = 128; size_t info_offset = 0; struct xdp_multiprog *mp = NULL; info = malloc(info_size); if (!info) return NULL; if (append_snprintf(&info, &info_size, &info_offset, "Capture was taken on interface %s, with the " "following XDP programs loaded:\n", cfg->iface.ifname) < 0) goto error_out; mp = xdp_multiprog__get_from_ifindex(cfg->iface.ifindex); if (IS_ERR_OR_NULL(mp)) { if (append_snprintf(&info, &info_size, &info_offset, " %s()\n", "")) goto error_out; } else { struct xdp_program *prog = NULL; if (append_snprintf(&info, &info_size, &info_offset, " %s()\n", xdp_program__name( xdp_multiprog__main_prog(mp))) < 0) goto error_out; while ((prog = xdp_multiprog__next_prog(prog, mp))) { if (append_snprintf(&info, &info_size, &info_offset, " %s()\n", xdp_program__name(prog)) < 0) goto error_out; } xdp_multiprog__close(mp); } return info; error_out: xdp_multiprog__close(mp); free(info); return NULL; } /***************************************************************************** * add_interfaces_to_pcapng() 
*****************************************************************************/ static bool add_interfaces_to_pcapng(struct dumpopt *cfg, struct xpcapng_dumper *pcapng_dumper, struct capture_programs *progs) { uint64_t if_speed; char if_drv[260]; if_speed = get_if_speed(&cfg->iface); if_drv[0] = 0; get_if_drv_info(&cfg->iface, if_drv, sizeof(if_drv)); for (unsigned int i = 0; i < progs->nr_of_progs; i++) { char if_name[128]; if (try_snprintf(if_name, sizeof(if_name), "%s:%s()@fentry", cfg->iface.ifname, progs->progs[i].func)) { pr_warn("ERROR: Could not format interface name, %s:%s()@fentry!\n", cfg->iface.ifname, progs->progs[i].func); return false; } if (xpcapng_dump_add_interface(pcapng_dumper, cfg->snaplen, if_name, NULL, NULL, if_speed, 9 /* nsec resolution */, if_drv) < 0) { pr_warn("ERROR: Can't add %s interface to PcapNG file!\n", if_name); return false; } if (try_snprintf(if_name, sizeof(if_name), "%s:%s()@fexit", cfg->iface.ifname, progs->progs[i].func)) { pr_warn("ERROR: Could not format interface name, %s:%s()@fexit!\n", cfg->iface.ifname, progs->progs[i].func); return false; } if (xpcapng_dump_add_interface(pcapng_dumper, cfg->snaplen, if_name, NULL, NULL, if_speed, 9 /* nsec resolution */, if_drv) < 0) { pr_warn("ERROR: Can't add %s interface to PcapNG file!\n", if_name); return false; } } return true; } static void print_compat_error(const char *what) { #if defined(__x86_64__) || defined(__i686__) pr_warn("ERROR: The kernel does not support " "fentry %s because it is too old!", what); #else pr_warn("ERROR: The kernel does not support " "fentry %s on the current CPU architecture!", what); #endif } /***************************************************************************** * load_and_attach_trace() *****************************************************************************/ static bool load_and_attach_trace(struct dumpopt *cfg, struct capture_programs *progs, unsigned int idx) { int err; struct bpf_object *trace_obj = NULL; struct bpf_program 
*trace_prog_fentry; struct bpf_program *trace_prog_fexit; struct bpf_link *trace_link_fentry = NULL; struct bpf_link *trace_link_fexit = NULL; struct bpf_map *perf_map; struct bpf_map *data_map; struct trace_configuration trace_cfg; if (idx >= progs->nr_of_progs || progs->nr_of_progs == 0) { pr_warn("ERROR: Attach program ID invalid!\n"); return false; } progs->progs[idx].attached = false; if (progs->progs[idx].rx_capture == 0) { pr_warn("ERROR: No RX capture mode to attach to!\n"); return false; } silence_libbpf_logging(); rlimit_loop: /* Load the trace program object */ trace_obj = open_bpf_file("xdpdump_bpf.o", NULL); err = libbpf_get_error(trace_obj); if (err) { pr_warn("ERROR: Can't open XDP trace program: %s(%d)\n", strerror(-err), err); trace_obj = NULL; goto error_exit; } /* Set the ifIndex in the DATA map */ data_map = bpf_object__find_map_by_name(trace_obj, "xdpdump_.data"); if (!data_map) { pr_warn("ERROR: Can't find the .data MAP in the trace " "program!\n"); goto error_exit; } if (bpf_map__value_size(data_map) != sizeof(trace_cfg)) { pr_warn("ERROR: Can't find the correct sized .data MAP in the " "trace program!\n"); goto error_exit; } trace_cfg.capture_if_ifindex = cfg->iface.ifindex; trace_cfg.capture_snaplen = cfg->snaplen; trace_cfg.capture_prog_index = idx; if (bpf_map__set_initial_value(data_map, &trace_cfg, sizeof(trace_cfg))) { pr_warn("ERROR: Can't set initial .data MAP in the trace " "program!\n"); goto error_exit; } /* Locate the fentry and fexit functions */ trace_prog_fentry = bpf_object__find_program_by_name(trace_obj, "trace_on_entry"); if (!trace_prog_fentry) { pr_warn("ERROR: Can't find XDP trace fentry function!\n"); goto error_exit; } trace_prog_fexit = bpf_object__find_program_by_name(trace_obj, "trace_on_exit"); if (!trace_prog_fexit) { pr_warn("ERROR: Can't find XDP trace fexit function!\n"); goto error_exit; } /* Before we can load the object in memory we need to set the attach * point to our function. 
*/ bpf_program__set_expected_attach_type(trace_prog_fentry, BPF_TRACE_FENTRY); bpf_program__set_expected_attach_type(trace_prog_fexit, BPF_TRACE_FEXIT); bpf_program__set_attach_target(trace_prog_fentry, xdp_program__fd(progs->progs[idx].prog), progs->progs[idx].func); bpf_program__set_attach_target(trace_prog_fexit, xdp_program__fd(progs->progs[idx].prog), progs->progs[idx].func); /* Reuse the xdpdump_perf_map for all programs */ perf_map = bpf_object__find_map_by_name(trace_obj, "xdpdump_perf_map"); if (!perf_map) { pr_warn("ERROR: Can't find xdpdump_perf_map in trace program!\n"); goto error_exit; } if (idx != 0) { err = bpf_map__reuse_fd(perf_map, progs->progs[0].perf_map_fd); if (err) { pr_warn("ERROR: Can't reuse xdpdump_perf_map: %s\n", strerror(-err)); goto error_exit; } } /* Load the bpf object into memory */ err = bpf_object__load(trace_obj); if (err) { if (err == -EPERM && !double_rlimit()) { bpf_object__close(trace_obj); goto rlimit_loop; } else if (err == -E2BIG) { print_compat_error("function load"); } else { char err_msg[STRERR_BUFSIZE]; libbpf_strerror(err, err_msg, sizeof(err_msg)); pr_warn("ERROR: Can't load eBPF object: %s(%d)\n", err_msg, err); } goto error_exit; } /* Attach trace programs only in the direction(s) needed */ if (progs->progs[idx].rx_capture & RX_FLAG_FENTRY) { trace_link_fentry = bpf_program__attach_trace(trace_prog_fentry); err = libbpf_get_error(trace_link_fentry); if (err) { if (err == -ENOTSUPP) print_compat_error("function attach"); else pr_warn("ERROR: Can't attach XDP trace fentry " "function: %s\n", strerror(-err)); goto error_exit; } } if (progs->progs[idx].rx_capture & RX_FLAG_FEXIT) { trace_link_fexit = bpf_program__attach_trace(trace_prog_fexit); err = libbpf_get_error(trace_link_fexit); if (err) { pr_warn("ERROR: Can't attach XDP trace fexit function: %s\n", strerror(-err)); goto error_exit; } } /* Figure out the fd for the BPF_MAP_TYPE_PERF_EVENT_ARRAY trace map. 
*/ if (idx == 0) { progs->progs[idx].perf_map_fd = bpf_map__fd(perf_map); if (progs->progs[idx].perf_map_fd < 0) { pr_warn("ERROR: Can't get xdpdump_perf_map file descriptor: %s\n", strerror(errno)); return false; } } else { progs->progs[idx].perf_map_fd = progs->progs[0].perf_map_fd; } progs->progs[idx].attached = true; progs->progs[idx].fentry_link = trace_link_fentry; progs->progs[idx].fexit_link = trace_link_fexit; progs->progs[idx].prog_obj = trace_obj; return true; error_exit: bpf_link__destroy(trace_link_fentry); bpf_link__destroy(trace_link_fexit); bpf_object__close(trace_obj); return false; } /***************************************************************************** * load_and_attach_traces() *****************************************************************************/ static bool load_and_attach_traces(struct dumpopt *cfg, struct capture_programs *progs) { for (unsigned int i = 0; i < progs->nr_of_progs; i++) if (!load_and_attach_trace(cfg, progs, i)) return false; return true; } /***************************************************************************** * detach_trace() *****************************************************************************/ static void detach_trace(struct capture_programs *progs, unsigned int idx) { if (idx >= progs->nr_of_progs || progs->nr_of_progs == 0 || !progs->progs[idx].attached) return; bpf_link__destroy(progs->progs[idx].fentry_link); bpf_link__destroy(progs->progs[idx].fexit_link); bpf_object__close(progs->progs[idx].prog_obj); progs->progs[idx].attached = false; } /***************************************************************************** * detach_traces() *****************************************************************************/ static void detach_traces(struct capture_programs *progs) { for (unsigned int i = 0; i < progs->nr_of_progs; i++) detach_trace(progs, i); } /***************************************************************************** * load_xdp_trace_program() 
*****************************************************************************/ static bool load_xdp_trace_program(struct dumpopt *cfg, struct capture_programs *progs) { DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, 0); int fd, rc; char errmsg[STRERR_BUFSIZE]; struct xdp_program *prog; struct bpf_map *perf_map; struct bpf_map *data_map; struct trace_configuration trace_cfg; if (!cfg || !progs) return false; silence_libbpf_logging(); silence_libxdp_logging(); opts.find_filename = "xdpdump_xdp.o"; opts.prog_name = "xdpdump"; prog = xdp_program__create(&opts); if (libxdp_get_error(prog)) { int err = libxdp_get_error(prog); libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("ERROR: Can't open XDP trace program: %s(%d)\n", errmsg, err); return false; } perf_map = bpf_object__find_map_by_name(xdp_program__bpf_obj(prog), "xdpdump_perf_map"); if (!perf_map) { pr_warn("ERROR: Can't find xdpdump_perf_map in the xdp program!\n"); goto error_exit; } /* Set the trace configuration in the DATA map */ data_map = bpf_object__find_map_by_name(xdp_program__bpf_obj(prog), "xdpdump_.data"); if (!data_map) { pr_warn("ERROR: Can't find the .data MAP in the xdp program!\n"); goto error_exit; } if (bpf_map__value_size(data_map) != sizeof(trace_cfg)) { pr_warn("ERROR: Can't find the correct sized .data MAP in the xdp program!\n"); goto error_exit; } trace_cfg.capture_if_ifindex = cfg->iface.ifindex; trace_cfg.capture_snaplen = cfg->snaplen; trace_cfg.capture_prog_index = 0; if (bpf_map__set_initial_value(data_map, &trace_cfg, sizeof(trace_cfg))) { pr_warn("ERROR: Can't set initial .data MAP in the xdp program!\n"); goto error_exit; } do { rc = xdp_program__attach(prog, cfg->iface.ifindex, cfg->load_xdp_mode, 0); } while (rc == -EPERM && !double_rlimit()); if (rc) { libxdp_strerror(rc, errmsg, sizeof(errmsg)); pr_warn("ERROR: Can't attach XDP trace program: %s(%d)\n", errmsg, rc); goto error_exit; } fd = bpf_map__fd(perf_map); if (fd < 0) { pr_warn("ERROR: Can't get xdpdump_perf_map file 
descriptor: %s\n", strerror(errno)); xdp_program__detach(prog, cfg->iface.ifindex, cfg->load_xdp_mode, 0); goto error_exit; } progs->progs[0].prog = prog; progs->progs[0].func = xdp_program__name(prog); progs->progs[0].rx_capture = RX_FLAG_FENTRY; progs->progs[0].perf_map_fd = fd; progs->nr_of_progs = 1; return true; error_exit: xdp_program__close(prog); return false; } /***************************************************************************** * unload_xdp_trace_program() *****************************************************************************/ static void unload_xdp_trace_program(struct dumpopt *cfg, struct capture_programs *progs) { if (!progs || progs->nr_of_progs != 1) return; xdp_program__detach(progs->progs[0].prog, cfg->iface.ifindex, cfg->load_xdp_mode, 0); xdp_program__close(progs->progs[0].prog); progs->progs[0].prog = NULL; progs->nr_of_progs = 0; } /***************************************************************************** * capture_on_interface() *****************************************************************************/ static bool capture_on_interface(struct dumpopt *cfg) { int err, cnt; bool rc = false; bool load_xdp = false; bool promiscuous = false; pcap_t *pcap = NULL; pcap_dumper_t *pcap_dumper = NULL; struct xpcapng_dumper *pcapng_dumper = NULL; struct perf_buffer *perf_buf = NULL; struct perf_event_attr perf_attr = { .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_BPF_OUTPUT, .sample_period = 1, .wakeup_events = 1, }; struct perf_handler_ctx perf_ctx; struct xdp_multiprog *mp; struct capture_programs tgt_progs = {}; mp = xdp_multiprog__get_from_ifindex(cfg->iface.ifindex); if (IS_ERR_OR_NULL(mp) || xdp_multiprog__main_prog(mp) == NULL) { if (!cfg->load_xdp) { pr_warn("WARNING: Specified interface does not have an XDP program loaded%s," "\n capturing in legacy mode!\n", IS_ERR_OR_NULL(mp) ? 
"" : " in software"); xdp_multiprog__close(mp); return capture_on_legacy_interface(cfg); } pr_warn("WARNING: Specified interface does not have an XDP program loaded%s!\n" " Will load a capture only XDP program!\n", IS_ERR_OR_NULL(mp) ? "" : " in software"); load_xdp = true; } if (!load_xdp) { if (find_target(cfg, mp, &tgt_progs)) goto error_exit; if (tgt_progs.nr_of_progs == 0) { pr_warn("ERROR: Failed finding any attached XDP program!\n"); goto error_exit; } } /* Enable promiscuous mode if requested. */ if (cfg->promiscuous) { err = set_if_promiscuous_mode(&cfg->iface, true, &cfg->promiscuous); if (err) { pr_warn("ERROR: Failed setting promiscuous mode: %s(%d)\n", strerror(-err), -err); goto error_exit; } promiscuous = true; } /* Load and attach programs */ if (!load_xdp) { if (!load_and_attach_traces(cfg, &tgt_progs)) { /* Actual errors are reported in the above function. */ goto error_exit; } } else { if (!load_xdp_trace_program(cfg, &tgt_progs)) { /* Actual errors are reported in the above function. 
*/ goto error_exit; } } /* Open the pcap handle */ if (cfg->pcap_file) { if (cfg->use_pcap) { pcap = pcap_open_dead(DLT_EN10MB, cfg->snaplen); if (!pcap) { pr_warn("ERROR: Can't open pcap dead handler!\n"); goto error_exit; } pcap_dumper = pcap_dump_open(pcap, cfg->pcap_file); if (!pcap_dumper) { pr_warn("ERROR: Can't open pcap file for writing!\n"); goto error_exit; } } else { char *program_info; struct utsname utinfo; char os_info[260]; memset(&utinfo, 0, sizeof(utinfo)); uname(&utinfo); os_info[0] = 0; if (try_snprintf(os_info, sizeof(os_info), "%s %s %s %s", utinfo.sysname, utinfo.nodename, utinfo.release, utinfo.version)) { pr_warn("ERROR: Could not format OS information!\n"); goto error_exit; } program_info = get_loaded_program_info(cfg); if (!program_info) { pr_warn("ERROR: Could not format program information!\n"); goto error_exit; } pcapng_dumper = xpcapng_dump_open(cfg->pcap_file, program_info, utinfo.machine, os_info, "xdpdump v" TOOLS_VERSION); free(program_info); if (!pcapng_dumper) { pr_warn("ERROR: Can't open PcapNG file for writing!\n"); goto error_exit; } if (!add_interfaces_to_pcapng(cfg, pcapng_dumper, &tgt_progs)) { /* Error output is handled in * add_interfaces_to_pcapng() */ goto error_exit; } } } /* Setup perf context */ memset(&perf_ctx, 0, sizeof(perf_ctx)); perf_ctx.cfg = cfg; perf_ctx.xdp_progs = &tgt_progs; perf_ctx.pcap = pcap; perf_ctx.pcap_dumper = pcap_dumper; perf_ctx.pcapng_dumper = pcapng_dumper; if (get_epoch_to_uptime_delta(&perf_ctx.epoch_delta)) goto error_exit; /* Determine the perf wakeup_events value to use */ #ifdef HAVE_LIBBPF_PERF_BUFFER__CONSUME if (cfg->pcap_file) { if (cfg->pcap_file[0] == '-' && cfg->pcap_file[1] == 0) { /* If we pipe trough stdio we do not want to buffer * any packets in the perf ring. */ perf_attr.wakeup_events = 1; } else { /* * If no specific wakeup value is specified assume * an average packet size of 2K we would like to * fill without losing any packets. 
*/ uint32_t events = PERF_MMAP_PAGE_COUNT * getpagesize() / (libbpf_num_possible_cpus() ?: 1) / 2048; if (events > 0) perf_attr.wakeup_events = min(PERF_MAX_WAKEUP_EVENTS, events); } } else { /* Only buffer in perf ring when using pcap_file */ perf_attr.wakeup_events = 1; } /* Cmdline option --perf-wakeup can override buffering levels */ if (cfg->perf_wakeup) perf_attr.wakeup_events = cfg->perf_wakeup; #endif pr_debug("perf-wakeup value uses is %u\n", perf_attr.wakeup_events); #ifdef HAVE_LIBBPF_PERF_BUFFER__NEW_RAW /* the configure check looks for the 6-argument variant of the function */ perf_buf = perf_buffer__new_raw(tgt_progs.progs[0].perf_map_fd, PERF_MMAP_PAGE_COUNT, &perf_attr, handle_perf_event, &perf_ctx, NULL); #else struct perf_buffer_raw_opts perf_opts = {}; /* Setup perf ring buffers */ perf_opts.attr = &perf_attr; perf_opts.event_cb = handle_perf_event; perf_opts.ctx = &perf_ctx; perf_buf = perf_buffer__new_raw(tgt_progs.progs[0].perf_map_fd, PERF_MMAP_PAGE_COUNT, &perf_opts); #endif if (perf_buf == NULL) { pr_warn("ERROR: Failed to allocate raw perf buffer: %s(%d)", strerror(errno), errno); goto error_exit; } /* No more error conditions, display some capture information */ fprintf(stderr, "listening on %s, ingress XDP program ", cfg->iface.ifname); for (unsigned int i = 0; i < tgt_progs.nr_of_progs; i++) fprintf(stderr, "ID %u func %s, ", xdp_program__id(tgt_progs.progs[i].prog), tgt_progs.progs[i].func); fprintf(stderr, "capture mode %s, capture size %d bytes\n", get_capture_mode_string(tgt_progs.progs[0].rx_capture), cfg->snaplen); fflush(stderr); /* Loop trough the dumper */ while (!exit_xdpdump) { cnt = perf_buffer__poll(perf_buf, 1000); if (cnt < 0 && errno != EINTR) { pr_warn("ERROR: Perf buffer polling failed: %s(%d)", strerror(errno), errno); goto error_exit; } } #ifdef HAVE_LIBBPF_PERF_BUFFER__CONSUME perf_buffer__consume(perf_buf); #endif fflush(stdout); fprintf(stderr, "\n%"PRIu64" packets captured\n", perf_ctx.captured_packets); 
fprintf(stderr, "%"PRIu64" packets dropped by perf ring\n", perf_ctx.missed_events); rc = true; error_exit: /* Cleanup all our resources */ if (promiscuous && cfg->promiscuous) { err = set_if_promiscuous_mode(&cfg->iface, false, NULL); if (err) pr_warn("ERROR: Failed disabling promiscuous mode: " "%s(%d)\n", strerror(-err), -err); } perf_buffer__free(perf_buf); xpcapng_dump_close(pcapng_dumper); if (pcap_dumper) pcap_dump_close(pcap_dumper); if (pcap) pcap_close(pcap); if (load_xdp) unload_xdp_trace_program(cfg, &tgt_progs); else detach_traces(&tgt_progs); xdp_multiprog__close(mp); return rc; } /***************************************************************************** * signal_handler() *****************************************************************************/ static void signal_handler(__unused int signo) { exit_xdpdump = true; if (exit_pcap) pcap_breakloop(exit_pcap); } /***************************************************************************** * main() *****************************************************************************/ int main(int argc, char **argv) { if (parse_cmdline_args(argc, argv, xdpdump_options, &cfg_dumpopt, sizeof(cfg_dumpopt), PROG_NAME, PROG_NAME, "XDPDump tool to dump network traffic", &defaults_dumpopt) != 0) return EXIT_FAILURE; /* If all the options are parsed ok, make sure we are root! */ if (check_bpf_environ()) return EXIT_FAILURE; if (cfg_dumpopt.snaplen == 0) cfg_dumpopt.snaplen = DEFAULT_SNAP_LEN; if (cfg_dumpopt.rx_capture == 0) cfg_dumpopt.rx_capture = RX_FLAG_FENTRY; /* See if we need to dump interfaces and exit */ if (cfg_dumpopt.list_interfaces) { if (iface_print_status(NULL)) return EXIT_SUCCESS; return EXIT_FAILURE; } /* Check if the system does not have more cores than we assume. 
*/ if (sysconf(_SC_NPROCESSORS_CONF) > MAX_CPUS) { pr_warn("ERROR: System has more cores (%ld) than maximum " "supported (%d)!\n", sysconf(_SC_NPROCESSORS_CONF), MAX_CPUS); return EXIT_FAILURE; } /* From here on we assume we need to capture data on an interface */ if (signal(SIGINT, signal_handler) == SIG_ERR || signal(SIGHUP, signal_handler) == SIG_ERR || signal(SIGTERM, signal_handler) == SIG_ERR) { pr_warn("ERROR: Failed assigning signal handler: %s\n", strerror(errno)); return EXIT_FAILURE; } if (cfg_dumpopt.iface.ifname == NULL) { pr_warn("ERROR: You must specific an interface to capture on!\n"); return EXIT_FAILURE; } if (!capture_on_interface(&cfg_dumpopt)) return EXIT_FAILURE; return EXIT_SUCCESS; } xdp-tools-1.6.1/xdp-dump/xdpdump.h000066400000000000000000000036241514310632100170410ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /****************************************************************************** * Multiple include protection ******************************************************************************/ #ifndef _XDPDUMP_H_ #define _XDPDUMP_H_ /****************************************************************************** * General definitions ******************************************************************************/ #define PERF_MAX_WAKEUP_EVENTS 64 #define PERF_MMAP_PAGE_COUNT 256 #define MAX_CPUS 512 /****************************************************************************** * General used macros ******************************************************************************/ #ifndef __packed #define __packed __attribute__((packed)) #endif /***************************************************************************** * trace configuration structure *****************************************************************************/ struct trace_configuration { __u32 capture_if_ifindex; __u32 capture_snaplen; __u32 capture_prog_index; }; /***************************************************************************** * perf 
data structures *****************************************************************************/ #define MDF_DIRECTION_FEXIT 1 struct pkt_trace_metadata { __u32 ifindex; __u32 rx_queue; __u16 pkt_len; __u16 cap_len; __u16 flags; __u16 prog_index; int action; } __packed; #ifndef __bpf__ struct perf_sample_event { struct perf_event_header header; __u64 time; __u32 size; struct pkt_trace_metadata metadata; unsigned char packet[]; }; struct perf_lost_event { struct perf_event_header header; __u64 id; __u64 lost; }; #endif /****************************************************************************** * End-of include file ******************************************************************************/ #endif /* _XDPDUMP_H_ */ xdp-tools-1.6.1/xdp-dump/xdpdump_bpf.c000066400000000000000000000073771514310632100176740ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /***************************************************************************** * Include files *****************************************************************************/ #include #include #include #include #include "xdpdump.h" /***************************************************************************** * Macros *****************************************************************************/ #define min(x,y) ((x)<(y) ? x : y) /***************************************************************************** * (re)definition of kernel data structures for use with BTF *****************************************************************************/ struct net_device { /* Structure does not need to contain all entries, * as "preserve_access_index" will use BTF to fix this... */ int ifindex; } __attribute__((preserve_access_index)); struct xdp_rxq_info { /* Structure does not need to contain all entries, * as "preserve_access_index" will use BTF to fix this... 
*/ struct net_device *dev; __u32 queue_index; } __attribute__((preserve_access_index)); struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; unsigned long handle; struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); /***************************************************************************** * Local definitions and global variables *****************************************************************************/ struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); __uint(max_entries, MAX_CPUS); __type(key, int); __type(value, __u32); } xdpdump_perf_map SEC(".maps"); /***************************************************************************** * .data section value storing the capture configuration *****************************************************************************/ struct trace_configuration trace_cfg SEC(".data"); /***************************************************************************** * trace_to_perf_buffer() *****************************************************************************/ static inline void trace_to_perf_buffer(struct xdp_buff *xdp, bool fexit, int action) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; struct pkt_trace_metadata metadata; if (data >= data_end || trace_cfg.capture_if_ifindex != xdp->rxq->dev->ifindex) return; metadata.prog_index = trace_cfg.capture_prog_index; metadata.ifindex = xdp->rxq->dev->ifindex; metadata.rx_queue = xdp->rxq->queue_index; metadata.pkt_len = (__u16)(data_end - data); metadata.cap_len = min(metadata.pkt_len, trace_cfg.capture_snaplen); metadata.action = action; metadata.flags = 0; if (fexit) metadata.flags |= MDF_DIRECTION_FEXIT; bpf_xdp_output(xdp, &xdpdump_perf_map, ((__u64) metadata.cap_len << 32) | BPF_F_CURRENT_CPU, &metadata, sizeof(metadata)); } /***************************************************************************** * trace_on_entry() 
*****************************************************************************/ SEC("fentry/func") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { trace_to_perf_buffer(xdp, false, 0); return 0; } /***************************************************************************** * trace_on_exit() *****************************************************************************/ SEC("fexit/func") int BPF_PROG(trace_on_exit, struct xdp_buff *xdp, int ret) { trace_to_perf_buffer(xdp, true, ret); return 0; } /***************************************************************************** * License *****************************************************************************/ char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-dump/xdpdump_xdp.c000066400000000000000000000044271514310632100177110ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /***************************************************************************** * Include files *****************************************************************************/ #include #include #include #include #include "xdpdump.h" /***************************************************************************** * Macros *****************************************************************************/ #define min(x, y) ((x) < (y) ? 
x : y) /***************************************************************************** * Local definitions and global variables *****************************************************************************/ struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); __uint(max_entries, MAX_CPUS); __type(key, int); __type(value, __u32); } xdpdump_perf_map SEC(".maps"); /***************************************************************************** * .data section value storing the capture configuration *****************************************************************************/ struct trace_configuration trace_cfg SEC(".data"); /***************************************************************************** * XDP trace program *****************************************************************************/ SEC("xdp") int xdpdump(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; struct pkt_trace_metadata metadata; if (data >= data_end || trace_cfg.capture_if_ifindex != xdp->ingress_ifindex) return XDP_PASS; metadata.prog_index = trace_cfg.capture_prog_index; metadata.ifindex = xdp->ingress_ifindex; metadata.rx_queue = xdp->rx_queue_index; metadata.pkt_len = (__u16)(data_end - data); metadata.cap_len = min(metadata.pkt_len, trace_cfg.capture_snaplen); metadata.action = 0; metadata.flags = 0; bpf_perf_event_output(xdp, &xdpdump_perf_map, ((__u64) metadata.cap_len << 32) | BPF_F_CURRENT_CPU, &metadata, sizeof(metadata)); return XDP_PASS; } /***************************************************************************** * License *****************************************************************************/ char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-filter/000077500000000000000000000000001514310632100155225ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-filter/.gitignore000066400000000000000000000000401514310632100175040ustar00rootroot00000000000000*.ll xdp-filter prog_features.h 
xdp-tools-1.6.1/xdp-filter/Makefile000066400000000000000000000012251514310632100171620ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) XDP_TARGETS := xdpfilt_dny_udp xdpfilt_dny_tcp xdpfilt_dny_ip \ xdpfilt_dny_eth xdpfilt_dny_all \ xdpfilt_alw_udp xdpfilt_alw_tcp xdpfilt_alw_ip \ xdpfilt_alw_eth xdpfilt_alw_all TOOL_NAME := xdp-filter USER_TARGETS := xdp-filter EXTRA_DEPS := xdpfilt_prog.h MAN_PAGE := xdp-filter.8 TEST_FILE := tests/test-xdp-filter.sh TEST_FILE_DEPS := $(wildcard tests/*.py) USER_GEN := prog_features.h EXTRA_USER_DEPS := $(USER_GEN) LIB_DIR = ../lib include $(LIB_DIR)/common.mk prog_features.h: ${XDP_TARGETS:=.o} extract_features.sh $(Q)sh extract_features.sh $^ > $@ || ( ret=$$?; rm -f $@; exit $$ret ) xdp-tools-1.6.1/xdp-filter/README.org000066400000000000000000000247041514310632100171770ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-filter #+TITLE: xdp-filter #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"A simple XDP-powered packet filter" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * xdp-filter - a simple XDP-powered packet filter XDP-filter is a packet filtering utility powered by XDP. It is deliberately simple and so does not have the same matching capabilities as, e.g., netfilter. Instead, thanks to XDP, it can achieve very high drop rates: tens of millions of packets per second on a single CPU core. 
** Running xdp-filter The syntax for running xdp-filter is: #+begin_src sh xdp-filter COMMAND [options] Where COMMAND can be one of: load - load xdp-filter on an interface unload - unload xdp-filter from an interface port - add a port to the filter list ip - add an IP address to the filter list ether - add an Ethernet MAC address to the filter list status - show current xdp-filter status poll - poll statistics output help - show the list of available commands #+end_src Each command, and its options are explained below. Or use =xdp-filter COMMAND --help= to see the options for each command. * The LOAD command To use =xdp-filter=, it must first be loaded onto an interface. This is accomplished with the =load= command, which takes the name of the interface as a parameter, and optionally allows specifying the features that should be included. By default all features are loaded, but de-selecting some features can speed up the packet matching, and increase performance by a substantial amount. The syntax for the =load= command is: =xdp-filter load [options] = Where == is the name of the interface to load =xdp-filter= onto, and must be specified. The supported options are: ** -m, --mode Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called /skb mode/ (also known as /generic XDP/) to be used, or 'hw' which causes the program to be offloaded to the hardware. ** -p, --policy This sets the policy =xdp-filter= applies to packets *not* matched by any of the filter rules. The default is /allow/, in which packets not matching any rules are allowed to pass. The other option is /deny/, in which *all* packets are dropped *except* those matched by the filter options. =xdp-filter= cannot be loaded simultaneously in /deny/ and /allow/ policy modes on the system. 
Note that loading =xdp-filter= in /deny/ mode will drop all traffic on the interface until suitable allow rules are installed, so some care is needed to avoid being locked out of a remote system. ** -f, --features Use this option to select which features to include when loaded =xdp-filter=. The default is to load all available features. So select individual features specify one or more of these: * *tcp*: Support filtering on TCP port number * *udp*: Support filtering on UDP port number * *ipv6*: Support filtering on IPv6 addresses * *ipv4*: Support filtering on IPv4 addresses * *ethernet*: Support filtering on Ethernet MAC addresses Specify multiple features by separating them with a comma. E.g.: =tcp,udp,ipv6=. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The UNLOAD command The =unload= command unloads =xdp-filter= from one (or all) interfaces, and cleans up the program state. The syntax for the =load= command is: =xdp-filter unload [options] = Where == is the name of the interface to unload =xdp-filter= from, and must be specified unless the *--all* option is used. The supported options are: ** -a, --all Specify this option to remove =xdp-filter= from all interfaces it was loaded onto. If this option is specified, no == is needed. This option can also be used to clean up all =xdp-filter= state if the XDP program(s) were unloaded by other means. ** -k, --keep-maps Specify this option to prevent =xdp-filter= from clearing its map state. By default, all BPF maps no longer needed by any loaded program are removed. However, this will also remove the contents of the maps (the filtering rules), so this option can be used to keep the maps around so the rules persist until =xdp-filter= is loaded again. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. 
** -h, --help Display a summary of the available options * The PORT command Use the =port= command to add a TCP or UDP port to the =xdp-filter= match list. For this to work, =xdp-filter= must be loaded with either the *udp* or the *tcp* feature (or both) on at least one interface. The syntax for the =port= command is: =xdp-filter port [options] = Where == is the port number to add (or remove if the *--remove* is specified). The supported options are: ** -r, --remove Remove the port instead of adding it. ** -m, --mode Select filtering mode. Valid options are *src* and *dst*, both of which may be specified as =src,dst=. If *src* is specified, the port number will added as a /source port/ match, while if *dst* is specified, the port number will be added as a /destination port/ match. If both are specified, a packet will be matched if *either* its source or destination port is the specified port number. ** -p, --proto Specify one (or both) of *udp* and/or *tcp* to match UDP or TCP ports, respectively. ** -s, --status If this option is specified, the current list of matched ports will be printed after inserting the port number. Otherwise, nothing will be printed. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The IP command Use the =ip= command to add an IPv6 or an IPv4 address to the =xdp-filter= match list. The syntax for the =ip= command is: =xdp-filter ip [options] = Where == is the IP address to add (or remove if the *--remove* is specified). Either IPv4 or IPv6 addresses can be specified, but =xdp-filter= must be loaded with the corresponding features (*ipv4* and *ipv6*, respectively). The supported options are: ** -r, --remove Remove the IP address instead of adding it. ** -m, --mode Select filtering mode. Valid options are *src* and *dst*, both of which may be specified as =src,dst=. 
If *src* is specified, the IP address will added as a /source IP/ match, while if *dst* is specified, the IP address will be added as a /destination IP/ match. If both are specified, a packet will be matched if *either* its source or destination IP is the specified IP address. ** -s, --status If this option is specified, the current list of matched ips will be printed after inserting the IP address. Otherwise, nothing will be printed. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The ETHER command Use the =ether= command to add an Ethernet MAC address to the =xdp-filter= match list. For this to work, =xdp-filter= must be loaded with either the *ethernet* feature on at least one interface. The syntax for the =ether= command is: =xdp-filter ether [options] = Where == is the MAC address to add (or remove if the *--remove* is specified). The supported options are: ** -r, --remove Remove the MAC address instead of adding it. ** -m, --mode Select filtering mode. Valid options are *src* and *dst*, both of which may be specified as =src,dst=. If *src* is specified, the MAC address will added as a /source MAC/ match, while if *dst* is specified, the MAC address will be added as a /destination MAC/ match. If both are specified, a packet will be matched if *either* its source or destination MAC is the specified MAC address. ** -s, --status If this option is specified, the current list of matched ips will be printed after inserting the MAC address. Otherwise, nothing will be printed. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The STATUS command The =status= command prints the current status of =xdp-filter=: Which interfaces it is loaded on, the current list of rules, and some statistics for how many packets have been processed in total, and how many times each rule has been hit. 
The syntax for the =status= command is: =xdp-filter status [options]= Where the supported options are: ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The POLL command The =poll= command periodically polls the =xdp-filter= statistics map and prints out the total number of packets and bytes processed by =xdp-filter=, as well as the number in the last polling interval, converted to packets (and bytes) per second. This can be used to inspect the performance of =xdp-filter=, and to compare the performance of the different feature sets selectable by the =load= parameter. The syntax for the =poll= command is: =xdp-filter poll [options]= Where the supported options are: ** -i, --interval The polling interval, in milliseconds. Defaults to 1000 (1 second). ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * Examples To filter all packets arriving on port 80 on eth0, issue the following commands: #+begin_src sh # xdp-filter load eth0 -f tcp,udp # xdp-filter port 80 #+end_src To filter all packets *except* those from IP address fc00:dead:cafe::1 issue the following commands (careful, this can lock you out of remote access!): #+begin_src sh # xdp-filter load eth0 -f ipv6 -p deny # xdp-filter ip fc00:dead:cafe::1 -m src #+end_src To allow packets from *either* IP fc00:dead:cafe::1 *or* arriving on port 22, issue the following (careful, this can lock you out of remote access!): #+begin_src sh # xdp-filter load eth0 -f ipv6,tcp -p deny # xdp-filter port 22 # xdp-filter ip fc00:dead:cafe::1 -m src #+end_src * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR xdp-filter was written by Toke Høiland-Jørgensen and Jesper Dangaard Brouer. This man page was written by Toke Høiland-Jørgensen. 
xdp-tools-1.6.1/xdp-filter/common_kern_user.h000066400000000000000000000013031514310632100212350ustar00rootroot00000000000000#ifndef COMMON_KERN_USER_H #define COMMON_KERN_USER_H #define FEAT_TCP (1<<0) #define FEAT_UDP (1<<1) #define FEAT_IPV6 (1<<2) #define FEAT_IPV4 (1<<3) #define FEAT_ETHERNET (1<<4) #define FEAT_ALL (FEAT_TCP|FEAT_UDP|FEAT_IPV6|FEAT_IPV4|FEAT_ETHERNET) #define FEAT_ALLOW (1<<5) #define FEAT_DENY (1<<6) #define MAP_FLAG_SRC (1<<0) #define MAP_FLAG_DST (1<<1) #define MAP_FLAG_TCP (1<<2) #define MAP_FLAG_UDP (1<<3) #define MAP_FLAGS (MAP_FLAG_SRC|MAP_FLAG_DST|MAP_FLAG_TCP|MAP_FLAG_UDP) #define COUNTER_SHIFT 6 #define MAP_NAME_PORTS filter_ports #define MAP_NAME_IPV4 filter_ipv4 #define MAP_NAME_IPV6 filter_ipv6 #define MAP_NAME_ETHERNET filter_ethernet #include "xdp/xdp_stats_kern_user.h" #endif xdp-tools-1.6.1/xdp-filter/extract_features.sh000066400000000000000000000011151514310632100214240ustar00rootroot00000000000000#!/bin/sh cat</dev/null) [ "$?" -ne "0" ] && continue found=0 for w in $featstring; do if [ "$w" = "0x00000000" ]; then found=1 else if [ "$found" -eq "1" ]; then feats=$w break fi fi done echo " {\"$f\", 0x$feats}," done cat<&1) ret=$? if [ "$ret" -ne "0" ]; then return $ret fi echo "$output" regex="Found prog '$prog'" if ! [[ $output =~ $regex ]]; then echo echo "Couldn't find '$regex' in output for feat $feat" >&2 return 1 fi check_run $XDP_FILTER unload $NS -v } test_load() { declare -a FEATS=(tcp udp ipv4 ipv6 ethernet all) declare -a PROGS_D=(xdpfilt_dny_tcp.o xdpfilt_dny_udp.o xdpfilt_dny_ip.o xdpfilt_dny_ip.o xdpfilt_dny_eth.o xdpfilt_dny_all.o) declare -a PROGS_A=(xdpfilt_alw_tcp.o xdpfilt_alw_udp.o xdpfilt_alw_ip.o xdpfilt_alw_ip.o xdpfilt_alw_eth.o xdpfilt_alw_all.o) local len=${#FEATS[@]} for (( i=0; i<$len; i++ )); do if ! try_feat ${FEATS[$i]} ${PROGS_A[$i]}; then return 1 fi if ! try_feat ${FEATS[$i]} ${PROGS_A[$i]} --mode skb; then return 1 fi if ! 
try_feat ${FEATS[$i]} ${PROGS_D[$i]} --policy deny; then return 1 fi if ! try_feat ${FEATS[$i]} ${PROGS_D[$i]} --policy deny --mode skb; then return 1 fi done if [ -d /sys/fs/bpf/xdp-filter ]; then die "/sys/fs/bpf/xdp-filter still exists!" fi } check_packet() { local filter="$1" local command="$2" local expect="$3" echo "Checking command '$command' filter '$filter'" PID=$(start_tcpdump tcpdump --immediate-mode -epni $NS "$filter") echo "Started listener as $PID" ns_exec bash -c "$command" output=$(stop_background $PID) echo "$output" if [[ "$expect" == "OK" ]]; then regex="[1-9] packets? captured" else regex="0 packets captured" fi if [[ "$output" =~ $regex ]]; then echo "Packet check $expect SUCCESS" return 0 else echo "Packet check $expect FAILURE" exit 1 fi } check_port() { local type=$1 local port=$2 local expect=$3 echo "$type port $port $expect" [[ "$type" == "tcp" ]] && command="echo test | socat - TCP6:[$OUTSIDE_IP6]:$port,connect-timeout=0.1" [[ "$type" == "udp" ]] && command="echo test | socat - UDP6:[$OUTSIDE_IP6]:$port" check_packet "$type dst port $port" "$command" $expect } test_ports_allow() { local TEST_PORT=10000 # default allow mode check_run $XDP_FILTER load -f udp,tcp $NS -v check_port tcp $TEST_PORT OK check_port udp $TEST_PORT OK check_run $XDP_FILTER port $TEST_PORT -v check_port tcp $TEST_PORT NOTOK check_port tcp $[TEST_PORT+1] OK check_port udp $TEST_PORT NOTOK check_port udp $[TEST_PORT+1] OK check_run $XDP_FILTER port -r $TEST_PORT -v check_port tcp $TEST_PORT OK check_port udp $TEST_PORT OK check_run $XDP_FILTER unload $NS -v } test_ports_deny() { local TEST_PORT=10000 # default deny mode check_run $XDP_FILTER load -p deny -f udp,tcp $NS -v check_port tcp $TEST_PORT NOTOK check_port udp $TEST_PORT NOTOK check_run $XDP_FILTER port $TEST_PORT -v check_port tcp $TEST_PORT OK check_port tcp $[TEST_PORT+1] NOTOK check_port udp $TEST_PORT OK check_port udp $[TEST_PORT+1] NOTOK check_run $XDP_FILTER port -r $TEST_PORT -v check_port tcp 
$TEST_PORT NOTOK check_port udp $TEST_PORT NOTOK check_run $XDP_FILTER unload $NS -v } check_ping6() { check_packet "dst $OUTSIDE_IP6" "$PING6 -W 0.1 -c 1 $OUTSIDE_IP6" $1 } check_ndisc6() { check_packet "icmp6" "ndisc6 -r 1 $OUTSIDE_IP6 -s $INSIDE_IP6 veth0" $1 } test_ipv6_allow() { check_ping6 OK check_run $XDP_FILTER load -f ipv6 $NS -v check_run $XDP_FILTER ip $OUTSIDE_IP6 check_ping6 NOTOK check_ndisc6 NOTOK check_run $XDP_FILTER ip -r $OUTSIDE_IP6 check_ping6 OK check_ndisc6 OK check_run $XDP_FILTER ip -m src $INSIDE_IP6 check_ping6 NOTOK check_run $XDP_FILTER ip -m src -r $INSIDE_IP6 check_ping6 OK check_run $XDP_FILTER unload $NS -v } test_ipv6_deny() { check_ping6 OK check_run $XDP_FILTER load -p deny -f ipv6 $NS -v check_run $XDP_FILTER ip $OUTSIDE_IP6 check_ping6 OK check_ndisc6 OK check_run $XDP_FILTER ip -r $OUTSIDE_IP6 check_ping6 NOTOK check_ndisc6 NOTOK check_run $XDP_FILTER ip -m src $INSIDE_IP6 check_ping6 OK check_run $XDP_FILTER ip -m src -r $INSIDE_IP6 check_ping6 NOTOK check_run $XDP_FILTER unload $NS -v } check_ping4() { check_packet "dst $OUTSIDE_IP4" "ping -W 0.1 -c 1 $OUTSIDE_IP4" $1 } check_arp() { check_packet "arp" "arping -c 1 -I veth0 $OUTSIDE_IP4" $1 } check_arp_src() { check_packet "arp" "arping -A -c 1 -I veth0 $INSIDE_IP4" $1 } test_ipv4_allow() { check_ping4 OK check_run $XDP_FILTER load -f ipv4 $NS -v check_arp OK check_arp_src OK check_run $XDP_FILTER ip $OUTSIDE_IP4 check_ping4 NOTOK check_arp NOTOK check_run $XDP_FILTER ip -r $OUTSIDE_IP4 check_ping4 OK check_arp OK check_run $XDP_FILTER ip -m src $INSIDE_IP4 check_ping4 NOTOK check_arp_src NOTOK check_run $XDP_FILTER ip -m src -r $INSIDE_IP4 check_ping4 OK check_arp_src OK check_run $XDP_FILTER unload $NS -v } test_ipv4_deny() { check_ping4 OK check_run $XDP_FILTER load -p deny -f ipv4 $NS -v check_run $XDP_FILTER ip $OUTSIDE_IP4 check_ping4 OK check_arp OK check_run $XDP_FILTER ip -r $OUTSIDE_IP4 check_ping4 NOTOK check_arp NOTOK check_run $XDP_FILTER ip -m src $INSIDE_IP4 
check_ping4 OK check_arp_src OK check_run $XDP_FILTER ip -m src -r $INSIDE_IP4 check_ping4 NOTOK check_arp_src NOTOK check_run $XDP_FILTER unload $NS -v } test_ether_allow() { check_ping6 OK check_run $XDP_FILTER load -f ethernet $NS -v check_run $XDP_FILTER ether $OUTSIDE_MAC check_ping6 NOTOK check_run $XDP_FILTER ether -r $OUTSIDE_MAC check_ping6 OK check_run $XDP_FILTER ether -m src $INSIDE_MAC check_ping6 NOTOK check_run $XDP_FILTER ether -m src -r $INSIDE_MAC check_ping6 OK check_run $XDP_FILTER unload $NS -v } test_ether_deny() { check_ping6 OK check_run $XDP_FILTER load -p deny -f ethernet $NS -v check_run $XDP_FILTER ether $OUTSIDE_MAC check_ping6 OK check_run $XDP_FILTER ether -r $OUTSIDE_MAC check_ping6 NOTOK check_run $XDP_FILTER ether -m src $INSIDE_MAC check_ping6 OK check_run $XDP_FILTER ether -m src -r $INSIDE_MAC check_ping6 NOTOK check_run $XDP_FILTER unload $NS -v } check_status() { local match local output match="$1" output=$($XDP_FILTER status) if echo "$output" | grep -q $match; then echo "Output check for $match SUCCESS" return 0 else echo "Output check for $match FAILURE" echo "Output: $output" exit 1 fi } check_status_no_match() { local match local output match="$1" output=$($XDP_FILTER status) if echo "$output" | grep -q $match; then echo "Output check for no $match FAILURE" echo "Output: $output" exit 1 else echo "Output check for no $match SUCCESS" return 0 fi } test_print() { check_run $XDP_FILTER load $NS -v check_run $XDP_FILTER ether aa:bb:cc:dd:ee:ff check_status "aa:bb:cc:dd:ee:ff" check_run $XDP_FILTER ip 1.2.3.4 check_status "1.2.3.4" check_run $XDP_FILTER ip aa::bb check_status "aa::bb" check_run $XDP_FILTER port 100 check_status "100.*dst,tcp,udp" check_run $XDP_FILTER unload $NS -v } check_port_removal_from_all() { local command_options=$1 local expected_output=$2 local TEST_PORT=54321 check_run $XDP_FILTER port $TEST_PORT -p tcp,udp -m src,dst check_status "$TEST_PORT.*src,dst,tcp,udp" check_run $XDP_FILTER port $TEST_PORT 
$command_options -r if [[ -z "$expected_output" ]]; then check_status_no_match "$TEST_PORT" else check_status "$TEST_PORT.*$expected_output" fi } test_output_remove() { check_run $XDP_FILTER load $NS -v # Remove only one mode/proto. check_port_removal_from_all "-m src" "dst,tcp,udp" check_port_removal_from_all "-m dst" "src,tcp,udp" check_port_removal_from_all "-p udp" "src,dst,tcp" check_port_removal_from_all "-p tcp" "src,dst,udp" # Remove one from each. check_port_removal_from_all "-m src -p udp" "dst,tcp" check_port_removal_from_all "-m src -p tcp" "dst,udp" check_port_removal_from_all "-m dst -p udp" "src,tcp" check_port_removal_from_all "-m dst -p tcp" "src,udp" # Remove everything. check_port_removal_from_all "" "" check_port_removal_from_all "-m src,dst" "" check_port_removal_from_all "-p tcp,udp" "" check_port_removal_from_all "-m src,dst -p tcp,udp" "" check_run $XDP_FILTER unload $NS -v } get_python() { if [[ -z "${PYTHON:-}" ]]; then local -a possible=(python3 python) local -a available local found=0 for i in "${possible[@]}"; do PYTHON=$(which $i) if [[ $? -eq 0 ]]; then found=1 break fi done if [[ found -eq 0 ]]; then return 1 fi fi $PYTHON -c "import xdp_test_harness" &> /dev/null if [[ $? -ne 0 ]]; then # Libraries are not installed. return 1 fi echo "$PYTHON" } run_python_test() { local module="$1" local module_path local python module_path="$(realpath --relative-to=. "$TOOL_TESTS_DIR" | sed "s/\//./g")" if [[ $? -ne 0 ]] || [[ $module_path == "." ]]; then return "$SKIPPED_TEST" fi python="$(get_python)" if [[ $? -ne 0 ]]; then return "$SKIPPED_TEST" fi $python -m xdp_test_harness.runner client "$module_path"."$module" if [[ $? 
-ne 0 ]]; then return 1 fi return 0 } test_python_basic() { run_python_test test_basic } test_python_slow() { run_python_test test_slow } cleanup_tests() { $XDP_FILTER unload $NS >/dev/null 2>&1 $XDP_LOADER unload $NS --all >/dev/null 2>&1 } xdp-tools-1.6.1/xdp-filter/tests/test_basic.py000066400000000000000000000207241514310632100213630ustar00rootroot00000000000000import subprocess import os import signal import unittest import scapy from scapy.all import (Ether, Packet, IP, IPv6, Raw, UDP, TCP, IPv6ExtHdrRouting) from xdp_test_harness.xdp_case import XDPCase, usingCustomLoader from xdp_test_harness.utils import XDPFlag from . common import XDP_FILTER, Base, get_mode_string @usingCustomLoader class LoadUnload(XDPCase): def setUp(self): self.msg = "WARNING: All tests that follow will likely provide false result.\n" def run_wrap(self, cmd): r = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.msg += "command: '" + str(cmd) + "'\n" self.msg += "stdout: '" + r.stdout.decode().strip() + "'\n" if r.stderr is not None: self.msg += "stderr: '" + r.stderr.decode().strip() + "'\n" self.msg += "\n" return r.returncode == 0 def load(self, mode=None): return self.run_wrap([ XDP_FILTER, "load", self.get_contexts().get_local_main().iface, "--verbose", "--mode", get_mode_string( mode if mode else self.get_contexts().get_local_main().xdp_mode ) ]) def unload(self): return self.run_wrap([ XDP_FILTER, "unload", self.get_contexts().get_local_main().iface, "--verbose" ]) def test_load_once(self): self.assertFalse(self.unload(), self.msg) self.assertTrue(self.load(), self.msg) self.assertTrue(self.unload(), self.msg) self.assertFalse(self.unload(), self.msg) def test_load_twice(self): self.assertFalse(self.unload(), self.msg) self.assertTrue(self.load(), self.msg) self.assertFalse(self.load(), self.msg) self.assertTrue(self.unload(), self.msg) self.assertFalse(self.unload(), self.msg) def test_load_hw(self): self.assertFalse(self.unload(), self.msg) 
self.load(mode=XDPFlag.HW_MODE), self.msg self.unload(), self.msg self.assertFalse(self.unload(), self.msg) class DirectBase: def drop_generic(self, address, target, use_inet6=False): to_send = self.to_send6 if use_inet6 else self.to_send self.arrived(to_send, self.send_packets(to_send)) subprocess.run([XDP_FILTER, target, address, "--mode", self.get_mode()]) self.not_arrived(to_send, self.send_packets(to_send)) subprocess.run([XDP_FILTER, target, address, "--mode", self.get_mode(), "--remove"]) self.arrived(to_send, self.send_packets(to_send)) def test_none_specified(self): self.arrived(self.to_send, self.send_packets(self.to_send)) def test_ether(self): self.drop_generic(self.get_device().ether, "ether") def test_ip(self): self.drop_generic(self.get_device().inet, "ip") def test_port(self): self.drop_generic(str(self.get_port()), "port") @unittest.skipIf(XDPCase.get_contexts().get_local_main().inet6 is None or XDPCase.get_contexts().get_remote_main().inet6 is None, "no inet6 address available") def test_ipv6(self): self.drop_generic(self.get_device().inet6, "ip", use_inet6=True) class BaseSrc: def get_device(self): return self.get_contexts().get_remote_main() def get_port(self): return self.src_port def get_mode(self): return "src" class BaseDst: def get_device(self): return self.get_contexts().get_local_main() def get_port(self): return self.dst_port def get_mode(self): return "dst" class BaseInvert: def setUp(self): subprocess.run([ XDP_FILTER, "load", "--policy", "deny", self.get_contexts().get_local_main().iface, "--mode", get_mode_string( self.get_contexts().get_local_main().xdp_mode ) ]) arrived = Base.not_arrived not_arrived = Base.arrived class DirectDropSrc(Base, DirectBase, BaseSrc): pass class DirectPassSrc(Base, DirectBase, BaseSrc, BaseInvert): pass class DirectDropDst(Base, DirectBase, BaseDst): pass class DirectPassDst(Base, DirectBase, BaseDst, BaseInvert): pass class IPv6ExtensionHeader(Base): def generic(self, extensions): packets = [Ether() / 
IPv6() / extensions / UDP(dport=55555)] * 5 self.arrived(packets, self.send_packets(packets)) subprocess.run([XDP_FILTER, "port", "55555", "--mode", "dst"]) self.not_arrived(packets, self.send_packets(packets)) subprocess.run([XDP_FILTER, "port", "55555", "--mode", "dst", "--remove"]) self.arrived(packets, self.send_packets(packets)) def test_routing(self): self.generic(scapy.layers.inet6.IPv6ExtHdrRouting()) def test_hop_by_hop(self): self.generic(scapy.layers.inet6.IPv6ExtHdrHopByHop()) def test_destination_options(self): self.generic(scapy.layers.inet6.IPv6ExtHdrDestOpt()) def test_fragment(self): self.generic(scapy.layers.inet6.IPv6ExtHdrFragment()) class IPv4ToIPv6Mapping(Base): def setUp(self): super().setUp() inet = self.get_contexts().get_local_main().inet self.address_explicit = "::ffff:" + inet inet6_split = [format(int(i), "02x") for i in inet.split(".")] self.address_converted = "::ffff:" + \ inet6_split[0] + inet6_split[1] + ":" + \ inet6_split[2] + inet6_split[3] self.packets = self.generate_default_packets( dst_inet=self.address_explicit, use_inet6=True) self.packets += self.generate_default_packets( dst_inet=self.address_converted, use_inet6=True) def test_filter_explicit_address(self): self.arrived(self.packets, self.send_packets(self.packets)) subprocess.run([XDP_FILTER, "ip", self.address_explicit, "--mode", "dst"]) self.not_arrived(self.packets, self.send_packets(self.packets)) subprocess.run([XDP_FILTER, "ip", self.address_explicit, "--mode", "dst", "--remove"]) self.arrived(self.packets, self.send_packets(self.packets)) def test_filter_converted_address(self): self.arrived(self.packets, self.send_packets(self.packets)) subprocess.run([XDP_FILTER, "ip", self.address_converted, "--mode", "dst"]) self.not_arrived(self.packets, self.send_packets(self.packets)) subprocess.run([XDP_FILTER, "ip", self.address_converted, "--mode", "dst", "--remove"]) self.arrived(self.packets, self.send_packets(self.packets)) class Status(Base): def setUp(self): pass 
def load(self, features): return subprocess.run([ XDP_FILTER, "load", self.get_contexts().get_local_main().iface, "--mode", get_mode_string( self.get_contexts().get_local_main().xdp_mode ), "--features", features, ]) def get_status(self): return subprocess.run( [XDP_FILTER, "status"], capture_output=True ).stdout.decode() def test_ethernet_feature(self): self.load("ethernet") self.check_status("ether", self.get_contexts().get_local_main().ether) def test_ipv4_feature(self): self.load("ipv4") self.check_status("ip", self.get_contexts().get_local_main().inet) def test_udp_feature(self): self.load("udp") self.check_status("port", str(self.dst_port)) def test_all_features(self): self.load("all") self.check_status("ether", self.get_contexts().get_local_main().ether) self.check_status("ip", self.get_contexts().get_local_main().inet) self.check_status("port", str(self.dst_port)) def check_status(self, subcommand, address): self.assertEqual(self.get_status().find(address), -1) subprocess.run([XDP_FILTER, subcommand, address]) self.assertNotEqual(self.get_status().find(address), -1) subprocess.run([XDP_FILTER, subcommand, address, "--remove"]) self.assertEqual(self.get_status().find(address), -1) xdp-tools-1.6.1/xdp-filter/tests/test_slow.py000066400000000000000000000100401514310632100212540ustar00rootroot00000000000000import subprocess import os import signal import unittest from scapy.all import (Ether, Packet, IP, IPv6, Raw, UDP, TCP, IPv6ExtHdrRouting) from xdp_test_harness.xdp_case import XDPCase, usingCustomLoader from xdp_test_harness.utils import XDPFlag from . 
common import Base, XDP_FILTER, get_mode_string class ManyAddresses(Base): def format_number(self, number, delimiter, format_string, part_size, parts_amount): splitted = [] while number > 0: splitted.append(int(number % (1 << part_size))) number = number >> part_size assert(len(splitted) <= parts_amount) if (len(splitted) < parts_amount): splitted += [0] * (parts_amount - len(splitted)) splitted.reverse() return delimiter.join(format(s, format_string) for s in splitted) def generate_addresses(self, delimiter, format_string, parts_amount, full_size): AMOUNT = 257 bits = parts_amount * full_size for gen_number in range(0, (1 << bits) - 1, int((1 << bits) / AMOUNT)): yield self.format_number(gen_number, delimiter, format_string, parts_amount, full_size) def filter_addresses(self, name, delimiter, format_string, parts_amount, full_size): summed = 0 for address in self.generate_addresses(delimiter, format_string, parts_amount, full_size): summed += 1 subprocess.run([XDP_FILTER, name, address, "--mode", "dst"]) output = subprocess.check_output([XDP_FILTER, "status"]) # Each address is on a separate line. self.assertGreaterEqual(len(output.splitlines()), summed) def get_invalid_address(self, name, delimiter, format_string, parts_amount, full_size): """ Try to add addresses to xdp-filter, return address that does not get added. 
""" last_length = subprocess.check_output([XDP_FILTER, "status"]) for address in self.generate_addresses(delimiter, format_string, parts_amount, full_size): new_length = subprocess.check_output( [XDP_FILTER, name, address, "--mode", "dst", "--status"]) if new_length == last_length: return address last_length = new_length return None def test_ip_arrive(self): missing = self.get_invalid_address("ip", ".", "d", 8, 4) if missing is None: return to_send = self.generate_default_packets(dst_inet=missing) res = self.send_packets(to_send) self.not_arrived(to_send, res) def test_ether_arrive(self): missing = self.get_invalid_address("ether", ":", "02x", 8, 6) if missing is None: return to_send = self.generate_default_packets(dst_ether=missing) res = self.send_packets(to_send) self.not_arrived(to_send, res) def test_port_arrive(self): missing = self.get_invalid_address("port", "", "d", 16, 1) if missing is None: return to_send = self.generate_default_packets(dst_port=missing) res = self.send_packets(to_send) self.not_arrived(to_send, res) def test_ip_status(self): self.filter_addresses("ip", ".", "d", 8, 4) def test_port_status(self): self.filter_addresses("port", "", "d", 16, 1) def test_ether_status(self): self.filter_addresses("ether", ":", "02x", 8, 6) class ManyAddressesInverted(ManyAddresses): def setUp(self): subprocess.run([ XDP_FILTER, "load", "--policy", "deny", self.get_contexts().get_local_main().iface, "--mode", get_mode_string( self.get_contexts().get_local_main().xdp_mode ) ]) arrived = Base.not_arrived not_arrived = Base.arrived xdp-tools-1.6.1/xdp-filter/xdp-filter.8000066400000000000000000000257211514310632100177000ustar00rootroot00000000000000.TH "xdp-filter" "8" "SEPTEMBER 5, 2022" "V1.6.1" "A simple XDP-powered packet filter" .SH "NAME" xdp-filter \- a simple XDP-powered packet filter .SH "SYNOPSIS" .PP XDP-filter is a packet filtering utility powered by XDP. 
It is deliberately simple and so does not have the same matching capabilities as, e.g., netfilter. Instead, thanks to XDP, it can achieve very high drop rates: tens of millions of packets per second on a single CPU core. .SS "Running xdp-filter" .PP The syntax for running xdp-filter is: .RS .nf \fCxdp-filter COMMAND [options] Where COMMAND can be one of: load - load xdp-filter on an interface unload - unload xdp-filter from an interface port - add a port to the filter list ip - add an IP address to the filter list ether - add an Ethernet MAC address to the filter list status - show current xdp-filter status poll - poll statistics output help - show the list of available commands \fP .fi .RE .PP Each command, and its options are explained below. Or use \fIxdp\-filter COMMAND \-\-help\fP to see the options for each command. .SH "The LOAD command" .PP To use \fIxdp\-filter\fP, it must first be loaded onto an interface. This is accomplished with the \fIload\fP command, which takes the name of the interface as a parameter, and optionally allows specifying the features that should be included. By default all features are loaded, but de-selecting some features can speed up the packet matching, and increase performance by a substantial amount. .PP The syntax for the \fIload\fP command is: .PP \fIxdp\-filter load [options] \fP .PP Where \fI\fP is the name of the interface to load \fIxdp\-filter\fP onto, and must be specified. The supported options are: .SS "-m, --mode " .PP Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called \fIskb mode\fP (also known as \fIgeneric XDP\fP) to be used, or 'hw' which causes the program to be offloaded to the hardware. .SS "-p, --policy " .PP This sets the policy \fIxdp\-filter\fP applies to packets \fBnot\fP matched by any of the filter rules. 
The default is \fIallow\fP, in which packets not matching any rules are allowed to pass. The other option is \fIdeny\fP, in which \fBall\fP packets are dropped \fBexcept\fP those matched by the filter options. .PP \fIxdp\-filter\fP cannot be loaded simultaneously in \fIdeny\fP and \fIallow\fP policy modes on the system. Note that loading \fIxdp\-filter\fP in \fIdeny\fP mode will drop all traffic on the interface until suitable allow rules are installed, so some care is needed to avoid being locked out of a remote system. .SS "-f, --features " .PP Use this option to select which features to include when loaded \fIxdp\-filter\fP. The default is to load all available features. So select individual features specify one or more of these: .IP \(bu 4 \fBtcp\fP: Support filtering on TCP port number .IP \(bu 4 \fBudp\fP: Support filtering on UDP port number .IP \(bu 4 \fBipv6\fP: Support filtering on IPv6 addresses .IP \(bu 4 \fBipv4\fP: Support filtering on IPv4 addresses .IP \(bu 4 \fBethernet\fP: Support filtering on Ethernet MAC addresses .PP Specify multiple features by separating them with a comma. E.g.: \fItcp,udp,ipv6\fP. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The UNLOAD command" .PP The \fIunload\fP command unloads \fIxdp\-filter\fP from one (or all) interfaces, and cleans up the program state. .PP The syntax for the \fIload\fP command is: .PP \fIxdp\-filter unload [options] \fP .PP Where \fI\fP is the name of the interface to unload \fIxdp\-filter\fP from, and must be specified unless the \fB--all\fP option is used. The supported options are: .SS "-a, --all" .PP Specify this option to remove \fIxdp\-filter\fP from all interfaces it was loaded onto. If this option is specified, no \fI\fP is needed. .PP This option can also be used to clean up all \fIxdp\-filter\fP state if the XDP program(s) were unloaded by other means. 
.SS "-k, --keep-maps" .PP Specify this option to prevent \fIxdp\-filter\fP from clearing its map state. By default, all BPF maps no longer needed by any loaded program are removed. However, this will also remove the contents of the maps (the filtering rules), so this option can be used to keep the maps around so the rules persist until \fIxdp\-filter\fP is loaded again. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The PORT command" .PP Use the \fIport\fP command to add a TCP or UDP port to the \fIxdp\-filter\fP match list. For this to work, \fIxdp\-filter\fP must be loaded with either the \fBudp\fP or the \fBtcp\fP feature (or both) on at least one interface. .PP The syntax for the \fIport\fP command is: .PP \fIxdp\-filter port [options] \fP .PP Where \fI\fP is the port number to add (or remove if the \fB--remove\fP is specified). The supported options are: .SS "-r, --remove" .PP Remove the port instead of adding it. .SS "-m, --mode " .PP Select filtering mode. Valid options are \fBsrc\fP and \fBdst\fP, both of which may be specified as \fIsrc,dst\fP. If \fBsrc\fP is specified, the port number will added as a \fIsource port\fP match, while if \fBdst\fP is specified, the port number will be added as a \fIdestination port\fP match. If both are specified, a packet will be matched if \fBeither\fP its source or destination port is the specified port number. .SS "-p, --proto " .PP Specify one (or both) of \fBudp\fP and/or \fBtcp\fP to match UDP or TCP ports, respectively. .SS "-s, --status" .PP If this option is specified, the current list of matched ports will be printed after inserting the port number. Otherwise, nothing will be printed. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. 
.SS "-h, --help" .PP Display a summary of the available options .SH "The IP command" .PP Use the \fIip\fP command to add an IPv6 or an IPv4 address to the \fIxdp\-filter\fP match list. .PP The syntax for the \fIip\fP command is: .PP \fIxdp\-filter ip [options] \fP .PP Where \fI\fP is the IP address to add (or remove if the \fB--remove\fP is specified). Either IPv4 or IPv6 addresses can be specified, but \fIxdp\-filter\fP must be loaded with the corresponding features (\fBipv4\fP and \fBipv6\fP, respectively). The supported options are: .SS "-r, --remove" .PP Remove the IP address instead of adding it. .SS "-m, --mode " .PP Select filtering mode. Valid options are \fBsrc\fP and \fBdst\fP, both of which may be specified as \fIsrc,dst\fP. If \fBsrc\fP is specified, the IP address will added as a \fIsource IP\fP match, while if \fBdst\fP is specified, the IP address will be added as a \fIdestination IP\fP match. If both are specified, a packet will be matched if \fBeither\fP its source or destination IP is the specified IP address. .SS "-s, --status" .PP If this option is specified, the current list of matched ips will be printed after inserting the IP address. Otherwise, nothing will be printed. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The ETHER command" .PP Use the \fIether\fP command to add an Ethernet MAC address to the \fIxdp\-filter\fP match list. For this to work, \fIxdp\-filter\fP must be loaded with either the \fBethernet\fP feature on at least one interface. .PP The syntax for the \fIether\fP command is: .PP \fIxdp\-filter ether [options] \fP .PP Where \fI\fP is the MAC address to add (or remove if the \fB--remove\fP is specified). The supported options are: .SS "-r, --remove" .PP Remove the MAC address instead of adding it. .SS "-m, --mode " .PP Select filtering mode. 
Valid options are \fBsrc\fP and \fBdst\fP, both of which may be specified as \fIsrc,dst\fP. If \fBsrc\fP is specified, the MAC address will added as a \fIsource MAC\fP match, while if \fBdst\fP is specified, the MAC address will be added as a \fIdestination MAC\fP match. If both are specified, a packet will be matched if \fBeither\fP its source or destination MAC is the specified MAC address. .SS "-s, --status" .PP If this option is specified, the current list of matched ips will be printed after inserting the MAC address. Otherwise, nothing will be printed. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The STATUS command" .PP The \fIstatus\fP command prints the current status of \fIxdp\-filter\fP: Which interfaces it is loaded on, the current list of rules, and some statistics for how many packets have been processed in total, and how many times each rule has been hit. .PP The syntax for the \fIstatus\fP command is: .PP \fIxdp\-filter status [options]\fP .PP Where the supported options are: .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The POLL command" .PP The \fIpoll\fP command periodically polls the \fIxdp\-filter\fP statistics map and prints out the total number of packets and bytes processed by \fIxdp\-filter\fP, as well as the number in the last polling interval, converted to packets (and bytes) per second. This can be used to inspect the performance of \fIxdp\-filter\fP, and to compare the performance of the different feature sets selectable by the \fIload\fP parameter. .PP The syntax for the \fIpoll\fP command is: .PP \fIxdp\-filter poll [options]\fP .PP Where the supported options are: .SS "-i, --interval " .PP The polling interval, in milliseconds. Defaults to 1000 (1 second). .SS "-v, --verbose" .PP Enable debug logging. 
Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "Examples" .PP To filter all packets arriving on port 80 on eth0, issue the following commands: .RS .nf \fC# xdp-filter load eth0 -f tcp,udp # xdp-filter port 80 \fP .fi .RE .PP To filter all packets \fBexcept\fP those from IP address fc00:dead:cafe::1 issue the following commands (careful, this can lock you out of remote access!): .RS .nf \fC# xdp-filter load eth0 -f ipv6 -p deny # xdp-filter ip fc00:dead:cafe::1 -m src \fP .fi .RE .PP To allow packets from \fBeither\fP IP fc00:dead:cafe::1 \fBor\fP arriving on port 22, issue the following (careful, this can lock you out of remote access!): .RS .nf \fC# xdp-filter load eth0 -f ipv6,tcp -p deny # xdp-filter port 22 # xdp-filter ip fc00:dead:cafe::1 -m src \fP .fi .RE .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP xdp-filter was written by Toke Høiland-Jørgensen and Jesper Dangaard Brouer. This man page was written by Toke Høiland-Jørgensen. 
xdp-tools-1.6.1/xdp-filter/xdp-filter.c000066400000000000000000000656401514310632100177570ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "params.h" #include "logging.h" #include "util.h" #include "stats.h" #include "common_kern_user.h" #include "prog_features.h" #define NEED_RLIMIT (20 * 1024 * 1024) /* 10 Mbyte */ #define PROG_NAME "xdp-filter" struct flag_val map_flags_all[] = { {"src", MAP_FLAG_SRC}, {"dst", MAP_FLAG_DST}, {"tcp", MAP_FLAG_TCP}, {"udp", MAP_FLAG_UDP}, {} }; struct flag_val map_flags_srcdst[] = { {"src", MAP_FLAG_SRC}, {"dst", MAP_FLAG_DST}, {} }; struct flag_val map_flags_tcpudp[] = { {"tcp", MAP_FLAG_TCP}, {"udp", MAP_FLAG_UDP}, {} }; static char *find_prog_file(__u32 features) { struct prog_feature *feat; if (!features) return NULL; for (feat = prog_features; feat->prog_name; feat++) { if ((ntohl(feat->features) & features) == features) return strdup(feat->prog_name); } return NULL; } static __u32 find_features(const char *progname) { struct prog_feature *feat; for (feat = prog_features; feat->prog_name; feat++) { if (is_prefix(progname, feat->prog_name)) return ntohl(feat->features); } return 0; } static int map_get_counter_flags(int fd, void *key, __u64 *counter, __u8 *flags) { /* For percpu maps, userspace gets a value per possible CPU */ int nr_cpus = libbpf_num_possible_cpus(); __u64 sum_ctr = 0; int i, err = 0; __u64 *values; if (nr_cpus < 0) return nr_cpus; values = calloc(nr_cpus, sizeof(*values)); if (!values) return -ENOMEM; if ((bpf_map_lookup_elem(fd, key, values)) != 0) { err = -ENOENT; goto out; } /* Sum values from each CPU */ for (i = 0; i < nr_cpus; i++) { __u8 flg = values[i] & MAP_FLAGS; if (!flg) { err = -ENOENT; /* not set */ goto out; } *flags = flg; sum_ctr += values[i] >> COUNTER_SHIFT; } *counter = sum_ctr; out: free(values); return err; } static int map_set_flags(int fd, 
void *key, __u8 flags, bool delete_empty) { /* For percpu maps, userspace gets a value per possible CPU */ int nr_cpus = libbpf_num_possible_cpus(); __u64 *values; int i, err; if (nr_cpus < 0) return nr_cpus; values = calloc(nr_cpus, sizeof(*values)); if (!values) return -ENOMEM; if (bpf_map_lookup_elem(fd, key, values) != 0) { memset(values, 0, sizeof(*values) * nr_cpus); } else if (!flags && delete_empty) { pr_debug("Deleting empty map value from flags %u\n", flags); err = bpf_map_delete_elem(fd, key); if (err) { err = -errno; pr_warn("Couldn't delete value from state map: %s\n", strerror(-err)); } goto out; } for (i = 0; i < nr_cpus; i++) values[i] = flags ? (values[i] & ~MAP_FLAGS) | (flags & MAP_FLAGS) : 0; pr_debug("Setting new map value %" PRIu64 " from flags %u\n", (uint64_t)values[0], flags); err = bpf_map_update_elem(fd, key, values, 0); if (err) { err = -errno; if (err == -E2BIG) pr_warn("Couldn't add entry: state map is full\n"); else pr_warn("Unable to update state map: %s\n", strerror(-err)); } out: free(values); return err; } static int get_iface_features(__unused const struct iface *iface, struct xdp_program *prog, __unused enum xdp_attach_mode mode, void *arg) { __u32 *all_feats = arg; *all_feats |= find_features(xdp_program__name(prog)); return 0; } static int get_used_features(const char *pin_root_path, __u32 *feats) { __u32 all_feats = 0; int err; err = iterate_pinned_programs(pin_root_path, get_iface_features, &all_feats); if (err && err != -ENOENT) return err; *feats = all_feats; return 0; } static const struct loadopt { bool help; struct iface iface; unsigned int features; enum xdp_attach_mode mode; unsigned int policy_mode; } defaults_load = { .features = FEAT_ALL, .mode = XDP_MODE_NATIVE, .policy_mode = FEAT_ALLOW, }; struct flag_val load_features[] = { {"tcp", FEAT_TCP}, {"udp", FEAT_UDP}, {"ipv6", FEAT_IPV6}, {"ipv4", FEAT_IPV4}, {"ethernet", FEAT_ETHERNET}, {"all", FEAT_ALL}, {} }; struct flag_val print_features[] = { {"tcp", FEAT_TCP}, 
{"udp", FEAT_UDP}, {"ipv6", FEAT_IPV6}, {"ipv4", FEAT_IPV4}, {"ethernet", FEAT_ETHERNET}, {"allow", FEAT_ALLOW}, {"deny", FEAT_DENY}, {} }; struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {NULL, 0} }; struct enum_val policy_modes[] = { {"allow", FEAT_ALLOW}, {"deny", FEAT_DENY}, {NULL, 0} }; static struct prog_option load_options[] = { DEFINE_OPTION("mode", OPT_ENUM, struct loadopt, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("policy", OPT_ENUM, struct loadopt, policy_mode, .short_opt = 'p', .typearg = policy_modes, .metavar = "", .help = "Policy for unmatched packets; default allow"), DEFINE_OPTION("dev", OPT_IFNAME, struct loadopt, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), DEFINE_OPTION("features", OPT_FLAGS, struct loadopt, features, .short_opt = 'f', .metavar = "", .typearg = load_features, .help = "Features to enable; default all"), END_OPTIONS }; int do_load(const void *cfg, const char *pin_root_path) { char errmsg[STRERR_BUFSIZE], featbuf[100]; const struct loadopt *opt = cfg; int err = EXIT_SUCCESS, lock_fd; struct xdp_program *p = NULL; unsigned int features; char *filename = NULL; __u32 used_feats; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = pin_root_path); DECLARE_LIBXDP_OPTS(xdp_program_opts, xdp_opts, 0); if (opt->mode == XDP_MODE_HW) { pr_warn("xdp-filter does not support offloading.\n"); return EXIT_FAILURE; } lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; err = get_used_features(pin_root_path, &used_feats); if (err) { pr_warn("Error getting list of loaded programs: %s\n", strerror(-err)); goto out; } err = EXIT_FAILURE; features = opt->features; if (opt->policy_mode == FEAT_DENY && used_feats & FEAT_ALLOW) { pr_warn("xdp-filter is already loaded in allow policy mode. 
" "Unload before loading in deny mode.\n"); goto out; } else if (opt->policy_mode == FEAT_ALLOW && used_feats & FEAT_DENY) { pr_warn("xdp-filter is already loaded in deny policy mode. " "Unload before loading in allow mode.\n"); goto out; } features |= opt->policy_mode; err = get_pinned_program(&opt->iface, pin_root_path, NULL, &p); if (!err) { pr_warn("xdp-filter is already loaded on %s\n", opt->iface.ifname); xdp_program__close(p); goto out; } print_flags(featbuf, sizeof(featbuf), print_features, features); pr_debug("Looking for eBPF program with features %s\n", featbuf); filename = find_prog_file(features); if (!filename) { pr_warn("Couldn't find an eBPF program with the requested feature set!\n"); goto out; } pr_debug("Found prog '%s' matching feature set to be loaded on interface '%s'.\n", filename, opt->iface.ifname); /* libbpf spits out a lot of unhelpful error messages while loading. * Silence the logging so we can provide our own messages instead; this * is a noop if verbose logging is enabled. 
*/ silence_libbpf_logging(); retry: xdp_opts.find_filename = filename; xdp_opts.opts = &opts; /* prog_name is NULL, so choose the first program in object */ p = xdp_program__create(&xdp_opts); err = libxdp_get_error(p); if (err) { if (err == -EPERM && !double_rlimit()) goto retry; libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("Couldn't load BPF program: %s(%d)\n", errmsg, err); p = NULL; goto out; } err = attach_xdp_program(p, &opt->iface, opt->mode, pin_root_path); if (err) { if (err == -EPERM && !double_rlimit()) { xdp_program__close(p); goto retry; } libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("Couldn't attach XDP program on iface '%s': %s(%d)\n", opt->iface.ifname, errmsg, err); goto out; } out: xdp_program__close(p); free(filename); prog_lock_release(lock_fd); return err; } static int remove_unused_maps(const char *pin_root_path, __u32 features) { int dir_fd, err = 0; dir_fd = open(pin_root_path, O_DIRECTORY); if (dir_fd < 0) { if (errno == ENOENT) return 0; err = -errno; pr_warn("Unable to open pin directory %s: %s\n", pin_root_path, strerror(-err)); goto out; } if (!(features & (FEAT_TCP | FEAT_UDP))) { err = unlink_pinned_map(dir_fd, textify(MAP_NAME_PORTS)); if (err) goto out; } if (!(features & FEAT_IPV4)) { err = unlink_pinned_map(dir_fd, textify(MAP_NAME_IPV4)); if (err) goto out; } if (!(features & FEAT_IPV6)) { err = unlink_pinned_map(dir_fd, textify(MAP_NAME_IPV6)); if (err) goto out; } if (!(features & FEAT_ETHERNET)) { err = unlink_pinned_map(dir_fd, textify(MAP_NAME_ETHERNET)); if (err) goto out; } if (!features) { char buf[PATH_MAX]; err = unlink_pinned_map(dir_fd, textify(XDP_STATS_MAP_NAME)); if (err) goto out; close(dir_fd); dir_fd = -1; err = try_snprintf(buf, sizeof(buf), "%s/%s", pin_root_path, "programs"); if (err) goto out; pr_debug("Removing program directory %s\n", buf); err = rmdir(buf); if (err) { err = -errno; pr_warn("Unable to rmdir: %s\n", strerror(-err)); goto out; } pr_debug("Removing pinning directory %s\n", 
pin_root_path); err = rmdir(pin_root_path); if (err) { err = -errno; pr_warn("Unable to rmdir: %s\n", strerror(-err)); goto out; } } out: if (dir_fd >= 0) close(dir_fd); return err; } static int remove_iface_program(const struct iface *iface, struct xdp_program *prog, enum xdp_attach_mode mode, void *arg) { char errmsg[STRERR_BUFSIZE], buf[100]; char *pin_root_path = arg; __u32 feats; int err; feats = find_features(xdp_program__name(prog)); if (!feats) { pr_warn("Unrecognised XDP program on interface %s. Not removing.\n", iface->ifname); return -ENOENT; } print_flags(buf, sizeof(buf), print_features, feats); pr_debug("Removing XDP program with features %s from iface %s\n", buf, iface->ifname); err = detach_xdp_program(prog, iface, mode, pin_root_path); if (err) { libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("Removing XDP program on iface %s failed (%d): %s\n", iface->ifname, -err, errmsg); } return err; } static const struct unloadopt { bool all; bool keep; struct iface iface; } defaults_unload = {}; static struct prog_option unload_options[] = { DEFINE_OPTION("dev", OPT_IFNAME, struct unloadopt, iface, .positional = true, .metavar = "", .help = "Unload from device "), DEFINE_OPTION("all", OPT_BOOL, struct unloadopt, all, .short_opt = 'a', .help = "Unload from all interfaces"), DEFINE_OPTION("keep-maps", OPT_BOOL, struct unloadopt, keep, .short_opt = 'k', .help = "Don't destroy unused maps after unloading"), END_OPTIONS }; int do_unload(const void *cfg, const char *pin_root_path) { const struct unloadopt *opt = cfg; int err = EXIT_SUCCESS, lock_fd; enum xdp_attach_mode mode; struct xdp_program *prog; char buf[100]; __u32 feats; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = pin_root_path); lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; if (opt->all) { pr_debug("Removing xdp-filter from all interfaces\n"); err = iterate_pinned_programs(pin_root_path, remove_iface_program, (void *)pin_root_path); if (err && err 
!= -ENOENT) goto out; goto clean_maps; } if (!opt->iface.ifindex) { pr_warn("Must specify ifname or --all\n"); err = EXIT_FAILURE; goto out; } err = get_pinned_program(&opt->iface, pin_root_path, &mode, &prog); if (err) { pr_warn("xdp-filter is not loaded on %s\n", opt->iface.ifname); err = EXIT_FAILURE; goto out; } err = remove_iface_program(&opt->iface, prog, mode, (void *)pin_root_path); if (err) goto out; clean_maps: if (opt->keep) { pr_debug("Not removing pinned maps because of --keep-maps option\n"); goto out; } pr_debug("Checking map usage and removing unused maps\n"); err = get_used_features(pin_root_path, &feats); if (err) goto out; print_flags(buf, sizeof(buf), print_features, feats); pr_debug("Features still being used: %s\n", feats ? buf : "none"); err = remove_unused_maps(pin_root_path, feats); if (err) goto out; out: prog_lock_release(lock_fd); return err; } int print_ports(int map_fd) { __u32 map_key = -1, prev_key = 0; int err; printf("Filtered ports:\n"); printf(" %-40s Mode Hit counter\n", ""); FOR_EACH_MAP_KEY (err, map_fd, map_key, prev_key) { char buf[100]; __u64 counter; __u8 flags = 0; err = map_get_counter_flags(map_fd, &map_key, &counter, &flags); if (err == -ENOENT) continue; else if (err) return err; print_flags(buf, sizeof(buf), map_flags_all, flags); printf(" %-40u %-15s %" PRIu64 "\n", ntohs(map_key), buf, (uint64_t)counter); } return 0; } static const struct portopt { unsigned int mode; unsigned int proto; __u16 port; bool print_status; bool remove; } defaults_port = {}; static struct prog_option port_options[] = { DEFINE_OPTION("port", OPT_U16, struct portopt, port, .positional = true, .metavar = "", .required = true, .help = "Port to add or remove"), DEFINE_OPTION("remove", OPT_BOOL, struct portopt, remove, .short_opt = 'r', .help = "Remove port instead of adding"), DEFINE_OPTION("mode", OPT_FLAGS, struct portopt, mode, .short_opt = 'm', .metavar = "", .typearg = map_flags_srcdst, .help = "Filter mode; default dst"), 
DEFINE_OPTION("proto", OPT_FLAGS, struct portopt, proto, .short_opt = 'p', .metavar = "", .typearg = map_flags_tcpudp, .help = "Protocol to filter; default tcp,udp"), DEFINE_OPTION("status", OPT_BOOL, struct portopt, print_status, .short_opt = 's', .help = "Print status of filtered ports after changing"), END_OPTIONS }; int do_port(const void *cfg, const char *pin_root_path) { int map_fd = -1, err = EXIT_SUCCESS, lock_fd; char modestr[100], protostr[100]; const struct portopt *opt = cfg; unsigned int proto = opt->proto; unsigned int mode = opt->mode; struct bpf_map_info info = {}; __u8 flags = 0; __u64 counter; __u32 map_key; lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; map_fd = get_pinned_map_fd(pin_root_path, textify(MAP_NAME_PORTS), &info); if (map_fd < 0) { pr_warn("Couldn't find port filter map; is xdp-filter loaded " "with the right features (udp and/or tcp)?\n"); err = EXIT_FAILURE; goto out; } pr_debug("Found map with fd %d for map id %d\n", map_fd, info.id); map_key = htons(opt->port); err = map_get_counter_flags(map_fd, &map_key, &counter, &flags); if (err && err != -ENOENT) goto out; if (opt->remove) { if (mode == 0 && proto == 0) { mode = MAP_FLAG_SRC | MAP_FLAG_DST; proto = MAP_FLAG_TCP | MAP_FLAG_UDP; } flags &= ~(mode | proto); } else { if (mode == 0) mode = MAP_FLAG_DST; if (proto == 0) proto = MAP_FLAG_TCP | MAP_FLAG_UDP; flags |= mode | proto; } print_flags(modestr, sizeof(modestr), map_flags_srcdst, mode); print_flags(protostr, sizeof(protostr), map_flags_tcpudp, proto); pr_debug("%s %s port %u mode %s\n", opt->remove ? 
"Removing" : "Adding", protostr, opt->port, modestr); if (!(flags & (MAP_FLAG_DST | MAP_FLAG_SRC)) || !(flags & (MAP_FLAG_TCP | MAP_FLAG_UDP))) flags = 0; err = map_set_flags(map_fd, &map_key, flags, false); if (err) goto out; if (opt->print_status) { err = print_ports(map_fd); if (err) goto out; } out: if (map_fd >= 0) close(map_fd); prog_lock_release(lock_fd); return err; } int __print_ips(int map_fd, int af) { struct ip_addr map_key = { .af = af }, prev_key = {}; int err; FOR_EACH_MAP_KEY (err, map_fd, map_key.addr, prev_key.addr) { char flagbuf[100], addrbuf[100]; __u8 flags = 0; __u64 counter; err = map_get_counter_flags(map_fd, &map_key.addr, &counter, &flags); if (err == -ENOENT) continue; else if (err) return err; print_flags(flagbuf, sizeof(flagbuf), map_flags_srcdst, flags); print_addr(addrbuf, sizeof(addrbuf), &map_key); printf(" %-40s %-15s %" PRIu64 "\n", addrbuf, flagbuf, (uint64_t)counter); } return 0; } int print_ips() { int map_fd4 = -1, map_fd6 = -1; char pin_root_path[PATH_MAX]; int err = 0; err = get_bpf_root_dir(pin_root_path, sizeof(pin_root_path), PROG_NAME, true); if (err) goto out; map_fd6 = get_pinned_map_fd(pin_root_path, textify(MAP_NAME_IPV6), NULL); map_fd4 = get_pinned_map_fd(pin_root_path, textify(MAP_NAME_IPV4), NULL); if (map_fd4 < 0 && map_fd6 < 0) { err = -ENOENT; goto out; } printf("Filtered IP addresses:\n"); printf(" %-40s Mode Hit counter\n", ""); if (map_fd6 >= 0) { err = __print_ips(map_fd6, AF_INET6); if (err) goto out; } if (map_fd4 >= 0) err = __print_ips(map_fd4, AF_INET); out: if (map_fd4 >= 0) close(map_fd4); if (map_fd6 >= 0) close(map_fd6); return err; } static int __do_address(const char *pin_root_path, const char *map_name, const char *feat_name, void *map_key, bool remove, int mode) { int map_fd = -1, err = 0; __u8 flags = 0; __u64 counter; map_fd = get_pinned_map_fd(pin_root_path, map_name, NULL); if (map_fd < 0) { pr_warn("Couldn't find filter map; is xdp-filter loaded " "with the %s feature?\n", feat_name); 
err = -ENOENT; goto out; } err = map_get_counter_flags(map_fd, map_key, &counter, &flags); if (err && err != -ENOENT) goto out; if (remove) flags &= ~mode; else flags |= mode; err = map_set_flags(map_fd, map_key, flags, true); if (err) goto out; out: return err ?: map_fd; } static const struct ipopt { unsigned int mode; struct ip_addr addr; bool print_status; bool remove; } defaults_ip = { .mode = MAP_FLAG_DST, }; static struct prog_option ip_options[] = { DEFINE_OPTION("addr", OPT_IPADDR, struct ipopt, addr, .positional = true, .metavar = "", .required = true, .help = "Address to add or remove"), DEFINE_OPTION("remove", OPT_BOOL, struct ipopt, remove, .short_opt = 'r', .help = "Remove address instead of adding"), DEFINE_OPTION("mode", OPT_FLAGS, struct ipopt, mode, .short_opt = 'm', .metavar = "", .typearg = map_flags_srcdst, .help = "Filter mode; default dst"), DEFINE_OPTION("status", OPT_BOOL, struct ipopt, print_status, .short_opt = 's', .help = "Print status of filtered addresses after changing"), END_OPTIONS }; static int do_ip(const void *cfg, const char *pin_root_path) { int map_fd = -1, err = EXIT_SUCCESS, lock_fd; char modestr[100], addrstr[100]; const struct ipopt *opt = cfg; struct ip_addr addr = opt->addr; bool v6; lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; print_flags(modestr, sizeof(modestr), map_flags_srcdst, opt->mode); print_addr(addrstr, sizeof(addrstr), &opt->addr); pr_debug("%s addr %s mode %s\n", opt->remove ? "Removing" : "Adding", addrstr, modestr); v6 = (opt->addr.af == AF_INET6); map_fd = __do_address(pin_root_path, v6 ? textify(MAP_NAME_IPV6) : textify(MAP_NAME_IPV4), v6 ? 
"ipv6" : "ipv4", &addr.addr, opt->remove, opt->mode); if (map_fd < 0) { err = map_fd; goto out; } if (opt->print_status) { err = print_ips(); if (err) goto out; } out: if (map_fd >= 0) close(map_fd); prog_lock_release(lock_fd); return err; } int print_ethers(int map_fd) { struct mac_addr map_key = {}, prev_key = {}; int err; printf("Filtered MAC addresses:\n"); printf(" %-40s Mode Hit counter\n", ""); FOR_EACH_MAP_KEY (err, map_fd, map_key, prev_key) { char modebuf[100], addrbuf[100]; __u8 flags = 0; __u64 counter; err = map_get_counter_flags(map_fd, &map_key, &counter, &flags); if (err == -ENOENT) continue; else if (err) return err; print_flags(modebuf, sizeof(modebuf), map_flags_srcdst, flags); print_macaddr(addrbuf, sizeof(addrbuf), &map_key); printf(" %-40s %-15s %" PRIu64 "\n", addrbuf, modebuf, (uint64_t)counter); } return 0; } static const struct etheropt { unsigned int mode; struct mac_addr addr; bool print_status; bool remove; } defaults_ether = { .mode = MAP_FLAG_DST, }; static struct prog_option ether_options[] = { DEFINE_OPTION("addr", OPT_MACADDR, struct etheropt, addr, .positional = true, .metavar = "", .required = true, .help = "Address to add or remove"), DEFINE_OPTION("remove", OPT_BOOL, struct etheropt, remove, .short_opt = 'r', .help = "Remove address instead of adding"), DEFINE_OPTION("mode", OPT_FLAGS, struct etheropt, mode, .short_opt = 'm', .metavar = "", .typearg = map_flags_srcdst, .help = "Filter mode; default dst"), DEFINE_OPTION("status", OPT_BOOL, struct etheropt, print_status, .short_opt = 's', .help = "Print status of filtered addresses after changing"), END_OPTIONS }; static int do_ether(const void *cfg, const char *pin_root_path) { int err = EXIT_SUCCESS, map_fd = -1, lock_fd; const struct etheropt *opt = cfg; struct mac_addr addr = opt->addr; char modestr[100], addrstr[100]; lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; print_flags(modestr, sizeof(modestr), map_flags_srcdst, opt->mode); 
print_macaddr(addrstr, sizeof(addrstr), &opt->addr); pr_debug("%s addr %s mode %s\n", opt->remove ? "Removing" : "Adding", addrstr, modestr); map_fd = __do_address(pin_root_path, textify(MAP_NAME_ETHERNET), "ethernet", &addr.addr, opt->remove, opt->mode); if (map_fd < 0) { err = map_fd; goto out; } if (opt->print_status) { err = print_ethers(map_fd); if (err) goto out; } out: if (map_fd >= 0) close(map_fd); prog_lock_release(lock_fd); return err; } static struct prog_option status_options[] = { END_OPTIONS }; int print_iface_status(const struct iface *iface, struct xdp_program *prog, enum xdp_attach_mode mode, __unused void *arg) { __u32 feat = 0; int err; printf("%s\n", xdp_program__name(prog)); err = get_iface_features(iface, prog, XDP_MODE_UNSPEC, &feat); if (err) return err; if (feat) { char featbuf[100]; char namebuf[100]; print_flags(featbuf, sizeof(featbuf), print_features, feat); snprintf(namebuf, sizeof(namebuf), "%s (%s mode)", iface->ifname, get_enum_name(xdp_modes, mode)); printf(" %-40s %s\n", namebuf, featbuf); } return 0; } int do_status(__unused const void *cfg, const char *pin_root_path) { int err = EXIT_SUCCESS, map_fd = -1, lock_fd; struct bpf_map_info info = {}; struct stats_record rec = {}; lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; map_fd = get_pinned_map_fd(pin_root_path, textify(XDP_STATS_MAP_NAME), &info); if (map_fd < 0) { err = map_fd; pr_warn("Couldn't find stats map. 
Maybe xdp-filter is not loaded?\n"); goto out; } rec.stats[XDP_DROP].enabled = true; rec.stats[XDP_PASS].enabled = true; rec.stats[XDP_ABORTED].enabled = true; err = stats_collect(map_fd, info.type, &rec); if (err) goto out; printf("CURRENT XDP-FILTER STATUS:\n\n"); printf("Aggregate per-action statistics:\n"); err = stats_print_one(&rec); if (err) goto out; printf("\n"); printf("Loaded on interfaces:\n"); printf(" %-40s Enabled features\n", ""); err = iterate_pinned_programs(pin_root_path, print_iface_status, NULL); if (err) goto out; printf("\n"); map_fd = get_pinned_map_fd(pin_root_path, textify(MAP_NAME_PORTS), NULL); if (map_fd >= 0) { err = print_ports(map_fd); if (err) goto out; printf("\n"); close(map_fd); map_fd = -1; } err = print_ips(); if (err && err != -ENOENT) goto out; printf("\n"); map_fd = get_pinned_map_fd(pin_root_path, textify(MAP_NAME_ETHERNET), NULL); if (map_fd >= 0) { err = print_ethers(map_fd); if (err) goto out; } printf("\n"); out: if (map_fd >= 0) close(map_fd); prog_lock_release(lock_fd); return err; } static const struct pollopt { __u32 interval; } defaults_poll = { .interval = 1000 }; static struct prog_option poll_options[] = { DEFINE_OPTION("interval", OPT_U32, struct pollopt, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval in milliseconds (default 1000)"), END_OPTIONS }; int do_poll(const void *cfg, const char *pin_root_path) { int err = 0, map_fd = -1, lock_fd; const struct pollopt *opt = cfg; bool exit = false; if (!opt->interval) { err = -EINVAL; pr_warn("Can't use a polling interval of 0\n"); goto out; } lock_fd = prog_lock_acquire(pin_root_path); if (lock_fd < 0) return lock_fd; map_fd = get_pinned_map_fd(pin_root_path, textify(XDP_STATS_MAP_NAME), NULL); if (map_fd < 0) { err = map_fd; pr_warn("Couldn't find stats map. 
Maybe xdp-filter is not loaded?\n"); prog_lock_release(lock_fd); return EXIT_FAILURE; } prog_lock_release(lock_fd); err = stats_poll(map_fd, opt->interval, &exit, pin_root_path, textify(XDP_STATS_MAP_NAME)); if (err) { pr_warn("Error polling statistics: %s\n", strerror(-err)); goto out; } out: return err ? EXIT_FAILURE : EXIT_SUCCESS; } int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: xdp-filter COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " load - load xdp-filter on an interface\n" " unload - unload xdp-filter from an interface\n" " port - add a port to the filter list\n" " ip - add an IP address to the filter list\n" " ether - add an Ethernet MAC address to the filter list\n" " status - show current xdp-filter status\n" " poll - poll statistics output\n" " help - show this help message\n" "\n" "Use 'xdp-filter COMMAND --help' to see options for each command\n"); return -1; } static const struct prog_command cmds[] = { DEFINE_COMMAND(load, "Load xdp-filter on an interface"), DEFINE_COMMAND(unload, "Unload xdp-filter from an interface"), DEFINE_COMMAND(port, "Add or remove ports from xdp-filter"), DEFINE_COMMAND(ip, "Add or remove IP addresses from xdp-filter"), DEFINE_COMMAND(ether, "Add or remove MAC addresses from xdp-filter"), DEFINE_COMMAND(poll, "Poll xdp-filter statistics"), DEFINE_COMMAND_NODEF(status, "Show xdp-filter status"), { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct loadopt load; struct unloadopt unload; struct portopt port; struct ipopt ip; struct etheropt ether; struct pollopt poll; }; int main(int argc, char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, true); return do_help(NULL, NULL); } xdp-tools-1.6.1/xdp-filter/xdpfilt_alw_all.c000066400000000000000000000003601514310632100210320ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define 
FILT_MODE_ALLOW #define FILT_MODE_ETHERNET #define FILT_MODE_IPV4 #define FILT_MODE_IPV6 #define FILT_MODE_UDP #define FILT_MODE_TCP #define FUNCNAME xdpfilt_alw_all #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_alw_eth.c000066400000000000000000000003541514310632100210450ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_ALLOW #define FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #undef FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_alw_eth #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_alw_ip.c000066400000000000000000000003541514310632100206750ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_ALLOW #undef FILT_MODE_ETHERNET #define FILT_MODE_IPV4 #define FILT_MODE_IPV6 #undef FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_alw_ip #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_alw_tcp.c000066400000000000000000000003541514310632100210530ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_ALLOW #undef FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #undef FILT_MODE_UDP #define FILT_MODE_TCP #define FUNCNAME xdpfilt_alw_tcp #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_alw_udp.c000066400000000000000000000003541514310632100210550ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_ALLOW #undef FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #define FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_alw_udp #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_dny_all.c000066400000000000000000000003571514310632100210470ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_DENY #define FILT_MODE_ETHERNET #define FILT_MODE_IPV4 #define FILT_MODE_IPV6 #define FILT_MODE_UDP #define FILT_MODE_TCP #define FUNCNAME xdpfilt_dny_all #include "xdpfilt_prog.h" 
xdp-tools-1.6.1/xdp-filter/xdpfilt_dny_eth.c000066400000000000000000000003531514310632100210530ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_DENY #define FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #undef FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_dny_eth #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_dny_ip.c000066400000000000000000000003531514310632100207030ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_DENY #undef FILT_MODE_ETHERNET #define FILT_MODE_IPV4 #define FILT_MODE_IPV6 #undef FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_dny_ip #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_dny_tcp.c000066400000000000000000000003531514310632100210610ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_DENY #undef FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #undef FILT_MODE_UDP #define FILT_MODE_TCP #define FUNCNAME xdpfilt_dny_tcp #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_dny_udp.c000066400000000000000000000003531514310632100210630ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define FILT_MODE_DENY #undef FILT_MODE_ETHERNET #undef FILT_MODE_IPV4 #undef FILT_MODE_IPV6 #define FILT_MODE_UDP #undef FILT_MODE_TCP #define FUNCNAME xdpfilt_dny_udp #include "xdpfilt_prog.h" xdp-tools-1.6.1/xdp-filter/xdpfilt_prog.h000066400000000000000000000204221514310632100203740ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ /* XDP filter program fragment. This header file contains the full-featured * program, split up with ifdefs. The actual program files xdpfilt_*.c * include this file with different #defines to create the * different eBPF program sections that include only the needed features. 
*/ #ifndef __XDPFILT_PROG_H #define __XDPFILT_PROG_H #include #include #include #include #define NDISC_NEIGHBOUR_SOLICITATION 135 #define NDISC_NEIGHBOUR_ADVERTISEMENT 136 #include "common_kern_user.h" /* Defines xdp_stats_map */ #include "xdp/xdp_stats_kern.h" #include "xdp/parsing_helpers.h" #ifdef FILT_MODE_DENY #define VERDICT_HIT XDP_PASS #define VERDICT_MISS XDP_DROP #define FEATURE_OPMODE FEAT_DENY #else #define VERDICT_HIT XDP_DROP #define VERDICT_MISS XDP_PASS #define FEATURE_OPMODE FEAT_ALLOW #endif #define CHECK_RET(ret) \ do { \ if ((ret) < 0) { \ action = XDP_ABORTED; \ goto out; \ } \ } while (0) #define CHECK_VERDICT(type, param) \ do { \ if ((action = lookup_verdict_##type(param)) != VERDICT_MISS) \ goto out; \ } while (0) #define CHECK_VERDICT_2(type, param1, param2) \ do { \ if ((action = lookup_verdict_##type(param1, param2)) != VERDICT_MISS) \ goto out; \ } while (0) #define CHECK_MAP(map, key, mask) \ do { \ __u64 *value; \ value = bpf_map_lookup_elem(map, key); \ if ((value) && (*(value) & (mask)) == (mask)) { \ *value += (1 << COUNTER_SHIFT); \ return VERDICT_HIT; \ } \ } while (0) #if defined(FILT_MODE_TCP) || defined(FILT_MODE_UDP) struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(max_entries, 65536); __type(key, __u32); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } MAP_NAME_PORTS SEC(".maps"); #ifdef FILT_MODE_TCP static int __always_inline lookup_verdict_tcp(struct tcphdr *tcphdr) { __u32 key; key = tcphdr->dest; CHECK_MAP(&filter_ports, &key, MAP_FLAG_DST | MAP_FLAG_TCP); key = tcphdr->source; CHECK_MAP(&filter_ports, &key, MAP_FLAG_SRC | MAP_FLAG_TCP); return VERDICT_MISS; } #define FEATURE_TCP FEAT_TCP #else #define FEATURE_TCP 0 #endif #ifdef FILT_MODE_UDP static int __always_inline lookup_verdict_udp(struct udphdr *udphdr) { __u32 key; key = udphdr->dest; CHECK_MAP(&filter_ports, &key, MAP_FLAG_DST | MAP_FLAG_UDP); key = udphdr->source; CHECK_MAP(&filter_ports, &key, MAP_FLAG_SRC | MAP_FLAG_UDP); return 
VERDICT_MISS; } #define FEATURE_UDP FEAT_UDP #else #define FEATURE_UDP 0 #endif #else #define FEATURE_UDP 0 #define FEATURE_TCP 0 #endif /* TCP || UDP */ #ifdef FILT_MODE_IPV4 struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, 10000); __type(key, __u32); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } MAP_NAME_IPV4 SEC(".maps"); static int __always_inline lookup_verdict_ipv4(__u32 *src_addr, __u32 *dst_addr) { __u32 addr; if (dst_addr) { addr = *dst_addr; CHECK_MAP(&filter_ipv4, &addr, MAP_FLAG_DST); } if (src_addr) { addr = *src_addr; CHECK_MAP(&filter_ipv4, &addr, MAP_FLAG_SRC); } return VERDICT_MISS; } #define CHECK_VERDICT_IPV4(src, dst) CHECK_VERDICT_2(ipv4, src, dst) #define FEATURE_IPV4 FEAT_IPV4 #else #define FEATURE_IPV4 0 #define CHECK_VERDICT_IPV4(src, dst) #endif /* FILT_MODE_IPV4 */ #ifdef FILT_MODE_IPV6 struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, 10000); __type(key, struct in6_addr); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } MAP_NAME_IPV6 SEC(".maps"); static int __always_inline lookup_verdict_ipv6(struct in6_addr *src_addr, struct in6_addr *dst_addr) { struct in6_addr addr; if (dst_addr) { addr = *dst_addr; CHECK_MAP(&filter_ipv6, &addr, MAP_FLAG_DST); } if (src_addr) { addr = *src_addr; CHECK_MAP(&filter_ipv6, &addr, MAP_FLAG_SRC); } return VERDICT_MISS; } #define CHECK_VERDICT_IPV6(src, dst) CHECK_VERDICT_2(ipv6, src, dst) #define FEATURE_IPV6 FEAT_IPV6 #else #define FEATURE_IPV6 0 #define CHECK_VERDICT_IPV6(src, dst) #endif /* FILT_MODE_IPV6 */ #ifdef FILT_MODE_ETHERNET struct ethaddr { __u8 addr[ETH_ALEN]; }; struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, 10000); __type(key, struct ethaddr); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } MAP_NAME_ETHERNET SEC(".maps"); static int __always_inline lookup_verdict_ethernet(struct ethhdr *eth) { struct ethaddr addr = {}; __builtin_memcpy(&addr, eth->h_dest, sizeof(addr)); 
CHECK_MAP(&filter_ethernet, &addr, MAP_FLAG_DST); __builtin_memcpy(&addr, eth->h_source, sizeof(addr)); CHECK_MAP(&filter_ethernet, &addr, MAP_FLAG_SRC); return VERDICT_MISS; } #define CHECK_VERDICT_ETHERNET(param) CHECK_VERDICT(ethernet, param) #define FEATURE_ETHERNET FEAT_ETHERNET #else #define FEATURE_ETHERNET 0 #define CHECK_VERDICT_ETHERNET(param) #endif /* FILT_MODE_ETHERNET */ #ifndef FUNCNAME #define FUNCNAME xdp_filt_unknown #endif struct { __uint(priority, 10); __uint(XDP_PASS, 1); } XDP_RUN_CONFIG(FUNCNAME); SEC("xdp") int FUNCNAME(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 action = VERDICT_MISS; /* Default action */ struct hdr_cursor nh; struct ethhdr *eth; int eth_type; nh.pos = data; eth_type = parse_ethhdr(&nh, data_end, ð); CHECK_RET(eth_type); CHECK_VERDICT_ETHERNET(eth); #if defined(FILT_MODE_IPV4) || defined(FILT_MODE_IPV6) || \ defined(FILT_MODE_TCP) || defined(FILT_MODE_UDP) struct icmp6hdr *icmp6hdr; struct ipv6hdr *ipv6hdr; struct iphdr *iphdr; int ip_type = 0, nh_op; if (eth_type == bpf_htons(ETH_P_IP)) { ip_type = parse_iphdr(&nh, data_end, &iphdr); CHECK_RET(ip_type); CHECK_VERDICT_IPV4(&iphdr->saddr, &iphdr->daddr); #if defined(FILT_MODE_IPV4) } else if (eth_type == bpf_htons(ETH_P_ARP)) { struct arphdr *arphdr; __be32 sip, tip; nh_op = parse_arphdr(&nh, data_end, &arphdr); CHECK_RET(nh_op); sip = arphdr->ar_sip; tip = arphdr->ar_tip; /* Always check the verdict of the ARP sender */ CHECK_VERDICT_IPV4(&sip, NULL); if (nh_op == bpf_htons(ARPOP_REQUEST)) { /* Someone wants to talk to TARGET, so target is a DST IP */ CHECK_VERDICT_IPV4(NULL, &tip); } else if (nh_op == bpf_htons(ARPOP_REPLY)) { /* Someone has addr TARGET, so target is a SRC IP */ CHECK_VERDICT_IPV4(&tip, NULL); } #endif } else if (eth_type == bpf_htons(ETH_P_IPV6)) { ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); CHECK_RET(ip_type); CHECK_VERDICT_IPV6(&ipv6hdr->saddr, &ipv6hdr->daddr); if (ip_type == 
IPPROTO_ICMPV6) { nh_op = parse_icmp6hdr(&nh, data_end, &icmp6hdr); CHECK_RET(nh_op); if (nh_op == NDISC_NEIGHBOUR_SOLICITATION || nh_op == NDISC_NEIGHBOUR_ADVERTISEMENT) { struct in6_addr *addr = nh.pos; if (addr + 1 > data_end) { action = XDP_ABORTED; goto out; } if (nh_op == NDISC_NEIGHBOUR_SOLICITATION) /* Someone wants to talk to TARGET, so target is a DST IP */ CHECK_VERDICT_IPV6(NULL, addr); else /* Someone has addr TARGET, so target is a SRC IP */ CHECK_VERDICT_IPV6(addr, NULL); } } } else { goto out; } #ifdef FILT_MODE_UDP struct udphdr *udphdr; if (ip_type == IPPROTO_UDP) { CHECK_RET(parse_udphdr(&nh, data_end, &udphdr)); CHECK_VERDICT(udp, udphdr); } #endif /* FILT_MODE_UDP */ #ifdef FILT_MODE_TCP struct tcphdr *tcphdr; if (ip_type == IPPROTO_TCP) { CHECK_RET(parse_tcphdr(&nh, data_end, &tcphdr)); CHECK_VERDICT(tcp, tcphdr); } #endif /* FILT_MODE_TCP*/ #endif /* FILT_MODE_{IPV4,IPV6,TCP,UDP} */ out: return xdp_stats_record_action(ctx, action); } char _license[] SEC("license") = "GPL"; __u32 _features SEC("features") = (FEATURE_ETHERNET | FEATURE_IPV4 | FEATURE_IPV6 | FEATURE_UDP | FEATURE_TCP | FEATURE_OPMODE); #else #error "Multiple includes of xdpfilt_prog.h" #endif // include guard xdp-tools-1.6.1/xdp-forward/000077500000000000000000000000001514310632100157015ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-forward/.gitignore000066400000000000000000000000141514310632100176640ustar00rootroot00000000000000xdp-forward xdp-tools-1.6.1/xdp-forward/Makefile000066400000000000000000000005151514310632100173420ustar00rootroot00000000000000# SPDX-License-Identifier: GPL-2.0 XDP_TARGETS := xdp_forward.bpf xdp_flowtable.bpf xdp_flowtable_sample.bpf BPF_SKEL_TARGETS := $(XDP_TARGETS) XDP_OBJ_INSTALL := TOOL_NAME := xdp-forward MAN_PAGE := xdp-forward.8 TEST_FILE := tests/test-xdp-forward.sh USER_TARGETS := xdp-forward LIB_DIR := ../lib include $(LIB_DIR)/common.mk 
xdp-tools-1.6.1/xdp-forward/README.org000066400000000000000000000160011514310632100173450ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-forward #+TITLE: xdp-forward #+OPTIONS: ^:nil #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"XDP program loader" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * xdp-forward - the XDP forwarding plane xdp-forward is an XDP forwarding plane, which will accelerate packet forwarding using XDP. To use it, simply load it on the set of interfaces to accelerate forwarding between. The userspace component of xdp-forward will then configure and load XDP programs on those interfaces, and forward packets between them using XDP_REDIRECT, using the kernel routing table or netfilter flowtable to determine the destination for each packet. Any packets that xdp-forward does not know how to forward will be passed up to the networking stack and handled by the kernel like normal. Depending on the mode xdp-forward is loaded in, this leads to different forwarding behaviours. See the sectinon on *Operating modes* below. ** Running xdp-forward The syntax for running xdp-forward is: #+begin_src sh xdp-forward COMMAND [options] Where COMMAND can be one of: load - Load the XDP forwarding plane unload - Unload the XDP forwarding plane help - show the list of available commands #+end_src Each command, and its options are explained below. Or use =xdp-forward COMMAND --help= to see the options for each command. * The LOAD command The =load= command loads the XDP forwarding plane on a list of interfaces. The syntax for the =load= command is: =xdp-forward load [options] = Where == is the name of the set of interfaces to forward packets between. 
An XDP program will be loaded on each interface, configured to forward packets to all other interfaces in the set (using the kernel routing table to determine the destination interface of each packet). The supported options are: ** -f, --fwd-mode Specifies which forwarding mode =xdp-forward= should operate in. Depending on the mode selected, =xdp-forward= will perform forwarding in different ways, which can lead to different behaviour, including which subset of kernel configuration (such as firewall rules) is respected during forwarding. See the section *FORWARDING MODES* below for a full description of each mode. ** -F, --fib-mode Specifies how =xdp-forward= performs routing table lookup in the linux kernel. See the section *FIB MODES* below for a full description of each mode. ** -m, --mode Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called /skb mode/ (also known as /generic XDP/) to be used, 'hw' which causes the program to be offloaded to the hardware, or 'unspecified' which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). Note that using 'unspecified' can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is 'native'. Note that hardware with support for the 'hw' mode is rare: Solarflare cards (using the 'sfc' driver) are the only devices with support for this in the mainline Linux kernel. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The UNLOAD command The =unload= command is used for unloading programs from an interface. The syntax for the =unload= command is: =xdp-forward unload [options] = Where == is the list of interfaces to unload the XDP forwarding plane from. 
Note that while =xdp-forward= will examine the XDP programs loaded on each interface and make sure to only unload its own program, it will not check that the list of supplied interfaces is the same as the one supplied during load. As such, it is possible to perform a partial unload by supplying a different list of interfaces, which may lead to unexpected behaviour. The supported options are: ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * FORWARDING MODES The =xdp-forward= utility supports the following forwarding modes (selected by the =--fwd-mode= parameter to =xdp-forward load=. ** fib (default) In the =fib= forwarding mode, =xdp-forward= will perform a lookup in the kernel routing table (or FIB) for each packet, and forward packets between the configured interfaces based on the result of the lookup. Any packet where the lookup fails will be passed up to the stack. This includes packets that require neighbour discovery for the next hop, meaning that packets will periodically pass up the kernel stack for next hop discovery (initially, and when the nexthop entry expires). Note that no checks other than the FIB lookup is performed; in particular, this completely bypasses the netfilter subsystem, so firewall rules will not be checked before forwarding. ** flowtable The =flowtable= operating mode offloads netfilter sw flowtable logic in the XDP layer if the hardware flowtable is not available. At the moment =xdp-forward= is able to offload just TCP or UDP netfilter flowtable entries to XDP. The user is supposed to configure the flowtable separately. * FIB MODES The =xdp-forward= utility supports the following fib modes (selected by the =--fib-mode= parameter to =xdp-forward load=. 
** full (default) In the =full= operating mode, =xdp-forward= will perform a full lookup in the kernel routing table (or FIB) for each packet, and forward packets between the configured interfaces based on the result of the lookup. In particular, it will apply any policy routing rules configured by the user. ** direct The =direct= mode functions like =full=, except it passes the =BPF_FIB_LOOKUP_DIRECT= flag to the FIB lookup routine. This means that any policy routing rules configured will be skipped during the lookup, which can improve performance (but won't obey the policy of those rules, obviously). * Examples In order to enable flowtable offloading for tcp and udp traffic between NICs n0 and n1, issue the following commands: #+begin_src sh #nft -f /dev/stdin < in pre-routing chain chain prerouting { type nat hook prerouting priority filter; policy accept; iifname == "${NS_NAMES[0]}" meta nfproto ipv4 tcp dport 12345 dnat ip to ${ALL_INSIDE_IP4[-1]}:10000 iifname == "${NS_NAMES[0]}" meta nfproto ipv6 tcp dport 12345 dnat ip6 to [${ALL_INSIDE_IP6[-1]}]:10000 } # enable SNAT of the client ip via masquerading in post-routing chain chain postrouting { type nat hook postrouting priority filter; policy accept; oifname "${NS_NAMES[-1]}" masquerade } } table inet filter { flowtable ft { hook ingress priority filter devices = { ${NS_NAMES[0]}, ${NS_NAMES[-1]} } } chain forward { type filter hook forward priority filter meta l4proto { tcp } flow add @ft } } EOF # check if bpf flowtable lookup is available skip_if_missing_kernel_symbol bpf_xdp_flow_lookup # Add some nft rules to check {dnat/snat} is done properly in # the main namespace check_run ip netns exec ${NS_NAMES[-1]} nft -f /dev/stdin </dev/null 2>&1 { $XDP_FORWARD unload ${NS_NAMES[@]} $XDP_LOADER unload $NS --all check_run ip netns exec ${NS_NAMES[-1]} nft flush ruleset check_run nft flush ruleset ip link del dev veth-forw-test } >/dev/null 2>&1 } 
xdp-tools-1.6.1/xdp-forward/xdp-forward.8000066400000000000000000000157611514310632100202410ustar00rootroot00000000000000.TH "xdp-forward" "8" "OCTOBER 11, 2024" "V1.6.1" "XDP program loader" .SH "NAME" xdp-forward \- the XDP forwarding plane .SH "SYNOPSIS" .PP xdp-forward is an XDP forwarding plane, which will accelerate packet forwarding using XDP. To use it, simply load it on the set of interfaces to accelerate forwarding between. The userspace component of xdp-forward will then configure and load XDP programs on those interfaces, and forward packets between them using XDP_REDIRECT, using the kernel routing table or netfilter flowtable to determine the destination for each packet. .PP Any packets that xdp-forward does not know how to forward will be passed up to the networking stack and handled by the kernel like normal. Depending on the mode xdp-forward is loaded in, this leads to different forwarding behaviours. See the sectinon on \fBOperating modes\fP below. .SS "Running xdp-forward" .PP The syntax for running xdp-forward is: .RS .nf \fCxdp-forward COMMAND [options] Where COMMAND can be one of: load - Load the XDP forwarding plane unload - Unload the XDP forwarding plane help - show the list of available commands \fP .fi .RE .PP Each command, and its options are explained below. Or use \fIxdp\-forward COMMAND \-\-help\fP to see the options for each command. .SH "The LOAD command" .PP The \fIload\fP command loads the XDP forwarding plane on a list of interfaces. .PP The syntax for the \fIload\fP command is: .PP \fIxdp\-forward load [options] \fP .PP Where \fI\fP is the name of the set of interfaces to forward packets between. An XDP program will be loaded on each interface, configured to forward packets to all other interfaces in the set (using the kernel routing table to determine the destination interface of each packet). .PP The supported options are: .SS "-f, --fwd-mode " .PP Specifies which forwarding mode \fIxdp\-forward\fP should operate in. 
Depending on the mode selected, \fIxdp\-forward\fP will perform forwarding in different ways, which can lead to different behaviour, including which subset of kernel configuration (such as firewall rules) is respected during forwarding. See the section \fBFORWARDING MODES\fP below for a full description of each mode. .SS "-F, --fib-mode " .PP Specifies how \fIxdp\-forward\fP performs routing table lookup in the linux kernel. See the section \fBFIB MODES\fP below for a full description of each mode. .SS "-m, --mode " .PP Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called \fIskb mode\fP (also known as \fIgeneric XDP\fP) to be used, 'hw' which causes the program to be offloaded to the hardware, or 'unspecified' which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). Note that using 'unspecified' can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is 'native'. Note that hardware with support for the 'hw' mode is rare: Solarflare cards (using the 'sfc' driver) are the only devices with support for this in the mainline Linux kernel. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The UNLOAD command" .PP The \fIunload\fP command is used for unloading programs from an interface. .PP The syntax for the \fIunload\fP command is: .PP \fIxdp\-forward unload [options] \fP .PP Where \fI\fP is the list of interfaces to unload the XDP forwarding plane from. Note that while \fIxdp\-forward\fP will examine the XDP programs loaded on each interface and make sure to only unload its own program, it will not check that the list of supplied interfaces is the same as the one supplied during load. 
As such, it is possible to perform a partial unload by supplying a different list of interfaces, which may lead to unexpected behaviour. .PP The supported options are: .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "FORWARDING MODES" .PP The \fIxdp\-forward\fP utility supports the following forwarding modes (selected by the \fI\-\-fwd\-mode\fP parameter to \fIxdp\-forward load\fP. .SS "fib (default)" .PP In the \fIfib\fP forwarding mode, \fIxdp\-forward\fP will perform a lookup in the kernel routing table (or FIB) for each packet, and forward packets between the configured interfaces based on the result of the lookup. Any packet where the lookup fails will be passed up to the stack. This includes packets that require neighbour discovery for the next hop, meaning that packets will periodically pass up the kernel stack for next hop discovery (initially, and when the nexthop entry expires). .PP Note that no checks other than the FIB lookup is performed; in particular, this completely bypasses the netfilter subsystem, so firewall rules will not be checked before forwarding. .SS "flowtable" .PP The \fIflowtable\fP operating mode offloads netfilter sw flowtable logic in the XDP layer if the hardware flowtable is not available. At the moment \fIxdp\-forward\fP is able to offload just TCP or UDP netfilter flowtable entries to XDP. The user is supposed to configure the flowtable separately. .SH "FIB MODES" .PP The \fIxdp\-forward\fP utility supports the following fib modes (selected by the \fI\-\-fib\-mode\fP parameter to \fIxdp\-forward load\fP. .SS "full (default)" .PP In the \fIfull\fP operating mode, \fIxdp\-forward\fP will perform a full lookup in the kernel routing table (or FIB) for each packet, and forward packets between the configured interfaces based on the result of the lookup. In particular, it will apply any policy routing rules configured by the user. 
.SS "direct" .PP The \fIdirect\fP mode functions like \fIfull\fP, except it passes the \fIBPF_FIB_LOOKUP_DIRECT\fP flag to the FIB lookup routine. This means that any policy routing rules configured will be skipped during the lookup, which can improve performance (but won't obey the policy of those rules, obviously). .SH "Examples" .PP In order to enable flowtable offloading for tcp and udp traffic between NICs n0 and n1, issue the following commands: .RS .nf \fC#nft -f /dev/stdin < #include #include #include #include "params.h" #include "util.h" #include "logging.h" #include "compat.h" #include "xdp_forward.skel.h" #include "xdp_flowtable.skel.h" #include "xdp_flowtable_sample.skel.h" #define MAX_IFACE_NUM 32 #define PROG_NAME "xdp-forward" int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: xdp-forward COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " load - Load the XDP forwarding plane\n" " unload - Unload the XDP forwarding plane\n" " help - show this help message\n" "\n" "Use 'xdp-forward COMMAND --help' to see options for each command\n"); return -1; } struct enum_val xdp_modes[] = { { "native", XDP_MODE_NATIVE }, { "skb", XDP_MODE_SKB }, { NULL, 0 } }; enum fwd_mode { FWD_FIB, FWD_FLOWTABLE, }; struct enum_val fwd_modes[] = { { "fib", FWD_FIB }, { "flowtable", FWD_FLOWTABLE }, { NULL, 0 } }; enum fib_mode { FIB_DIRECT, FIB_FULL, }; struct enum_val fib_modes[] = { { "direct", FIB_DIRECT }, { "full", FIB_FULL }, { NULL, 0 } }; static int find_prog(struct iface *iface, bool detach) { struct xdp_program *prog = NULL; enum xdp_attach_mode mode; struct xdp_multiprog *mp; int ret = -ENOENT; mp = xdp_multiprog__get_from_ifindex(iface->ifindex); if (!mp) return ret; if (xdp_multiprog__is_legacy(mp)) { prog = xdp_multiprog__main_prog(mp); goto check; } while ((prog = xdp_multiprog__next_prog(prog, mp))) { check: if (!strcmp(xdp_program__name(prog), "xdp_fwd_fib_full") || !strcmp(xdp_program__name(prog), 
"xdp_fwd_fib_direct") || !strcmp(xdp_program__name(prog), "xdp_fwd_flow_full") || !strcmp(xdp_program__name(prog), "xdp_fwd_flow_direct")) { mode = xdp_multiprog__attach_mode(mp); ret = 0; if (detach) { ret = xdp_program__detach(prog, iface->ifindex, mode, 0); if (ret) pr_warn("Couldn't detach XDP program from interface %s: %s\n", iface->ifname, strerror(errno)); break; } } } xdp_multiprog__close(mp); return ret; } struct load_opts { enum fwd_mode fwd_mode; enum fib_mode fib_mode; enum xdp_attach_mode xdp_mode; struct iface *ifaces; } defaults_load = { .fwd_mode = FWD_FIB, .fib_mode = FIB_FULL, }; struct prog_option load_options[] = { DEFINE_OPTION("fwd-mode", OPT_ENUM, struct load_opts, fwd_mode, .short_opt = 'f', .typearg = fwd_modes, .metavar = "", .help = "Forward mode to run in; see man page. Default fib"), DEFINE_OPTION("fib-mode", OPT_ENUM, struct load_opts, fib_mode, .short_opt = 'F', .typearg = fib_modes, .metavar = "", .help = "Fib mode to run in; see man page. Default full"), DEFINE_OPTION("xdp-mode", OPT_ENUM, struct load_opts, xdp_mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("devs", OPT_IFNAME_MULTI, struct load_opts, ifaces, .positional = true, .metavar = "", .min_num = 1, .max_num = MAX_IFACE_NUM, .required = 1, .help = "Redirect from and to devices "), END_OPTIONS }; static bool sample_probe_bpf_xdp_flow_lookup(void) { struct xdp_flowtable_sample *skel; bool res; skel = xdp_flowtable_sample__open_and_load(); res = !!skel; xdp_flowtable_sample__destroy(skel); return res; } static int do_load(const void *cfg, __unused const char *pin_root_path) { DECLARE_LIBBPF_OPTS(xdp_program_opts, opts); struct xdp_program *xdp_prog = NULL; const struct load_opts *opt = cfg; struct bpf_program *prog = NULL; struct bpf_map *map = NULL; struct bpf_object *obj; int ret = EXIT_FAILURE; struct iface *iface; void *skel; switch (opt->fwd_mode) { case FWD_FIB: opts.prog_name = opt->fib_mode == 
FIB_DIRECT ? "xdp_fwd_fib_direct" : "xdp_fwd_fib_full"; break; case FWD_FLOWTABLE: opts.prog_name = opt->fib_mode == FIB_DIRECT ? "xdp_fwd_flow_direct" : "xdp_fwd_flow_full"; break; default: goto end; } if (opt->fwd_mode == FWD_FLOWTABLE) { struct xdp_flowtable *xdp_flowtable_skel; if (!sample_probe_bpf_xdp_flow_lookup()) { pr_warn("The kernel does not support the bpf_xdp_flow_lookup() kfunc\n"); goto end; } xdp_flowtable_skel = xdp_flowtable__open(); if (!xdp_flowtable_skel) { pr_warn("Failed to load skeleton: %s\n", strerror(errno)); goto end; } map = xdp_flowtable_skel->maps.xdp_tx_ports; obj = xdp_flowtable_skel->obj; skel = (void *)xdp_flowtable_skel; } else { struct xdp_forward *xdp_forward_skel = xdp_forward__open(); if (!xdp_forward_skel) { pr_warn("Failed to load skeleton: %s\n", strerror(errno)); goto end; } map = xdp_forward_skel->maps.xdp_tx_ports; obj = xdp_forward_skel->obj; skel = (void *)xdp_forward_skel; } /* Make sure we only load the one XDP program we are interested in */ while ((prog = bpf_object__next_program(obj, prog)) != NULL) if (bpf_program__type(prog) == BPF_PROG_TYPE_XDP && bpf_program__expected_attach_type(prog) == BPF_XDP) bpf_program__set_autoload(prog, false); opts.obj = obj; xdp_prog = xdp_program__create(&opts); if (!xdp_prog) { pr_warn("Couldn't open XDP program: %s\n", strerror(errno)); goto end_destroy; } /* We always set the frags support bit: nothing the program does is * incompatible with multibuf, and it's perfectly fine to load a program * with frags support on an interface with a small MTU. We don't risk * setting any flags the kernel will balk at, either, since libxdp will * do the feature probing for us and skip the flag if the kernel doesn't * support it. * * The function below returns EOPNOTSUPP it libbpf is too old to support * setting the flags, but we just ignore that, since in such a case the * best we can do is just attempt to run without the frags support. 
*/ xdp_program__set_xdp_frags_support(xdp_prog, true); for (iface = opt->ifaces; iface; iface = iface->next) { if (find_prog(iface, false) != -ENOENT) { pr_warn("Already attached to %s, not reattaching\n", iface->ifname); continue; } ret = xdp_program__attach(xdp_prog, iface->ifindex, opt->xdp_mode, 0); if (ret) { pr_warn("Failed to attach XDP program to iface %s: %s\n", iface->ifname, strerror(-ret)); goto end_detach; } ret = bpf_map_update_elem(bpf_map__fd(map), &iface->ifindex, &iface->ifindex, 0); if (ret) { pr_warn("Failed to update devmap value: %s\n", strerror(errno)); goto end_detach; } pr_info("Loaded on interface %s\n", iface->ifname); } ret = EXIT_SUCCESS; end_destroy: if (opt->fwd_mode == FWD_FLOWTABLE) xdp_flowtable__destroy(skel); else xdp_forward__destroy(skel); end: return ret; end_detach: ret = EXIT_FAILURE; for (iface = opt->ifaces; iface; iface = iface->next) xdp_program__detach(xdp_prog, iface->ifindex, opt->xdp_mode, 0); goto end_destroy; } struct unload_opts { struct iface *ifaces; } defaults_unload = {}; struct prog_option unload_options[] = { DEFINE_OPTION("devs", OPT_IFNAME_MULTI, struct unload_opts, ifaces, .positional = true, .metavar = "", .min_num = 1, .max_num = MAX_IFACE_NUM, .help = "Redirect from and to devices "), END_OPTIONS }; static int do_unload(const void *cfg, __unused const char *pin_root_path) { const struct unload_opts *opt = cfg; int ret = EXIT_SUCCESS; struct iface *iface; for (iface = opt->ifaces; iface; iface = iface->next) { if (find_prog(iface, true)) { pr_warn("Couldn't find program on interface %s\n", iface->ifname); ret = EXIT_FAILURE; } pr_info("Unloaded from interface %s\n", iface->ifname); } return ret; } static const struct prog_command cmds[] = { DEFINE_COMMAND(load, "Load XDP forwarding plane"), DEFINE_COMMAND(unload, "Unload XDP forwarding plane"), { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct load_opts load; struct unload_opts unload; }; int main(int argc, 
char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, false); return do_help(NULL, NULL); } xdp-tools-1.6.1/xdp-forward/xdp_flowtable.bpf.c000066400000000000000000000366011514310632100214530ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Original xdp_fwd sample Copyright (c) 2017-18 David Ahern */ #include #include #include #include #define AF_INET 2 #define AF_INET6 10 #define IPV6_FLOWINFO_MASK bpf_htons(0x0FFFFFFF) #define IP_MF 0x2000 /* "More Fragments" */ #define IP_OFFSET 0x1fff /* "Fragment Offset" */ #define CSUM_MANGLED_0 ((__sum16)0xffff) #define BIT(x) (1 << (x)) struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, 64); } xdp_tx_ports SEC(".maps"); struct bpf_flowtable_opts { __s32 error; }; struct flow_offload_tuple_rhash * bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *, struct bpf_flowtable_opts *, __u32) __ksym; /* from include/net/ip.h */ static __always_inline int ip_decrease_ttl(struct iphdr *iph) { __u32 check = (__u32)iph->check; check += (__u32)bpf_htons(0x0100); iph->check = (__sum16)(check + (check >= 0xFFFF)); return --iph->ttl; } static __always_inline __u32 csum_add(__u32 csum, __u32 addend) { __u32 res = csum + addend; return res + (res < addend); } static __always_inline __u16 csum_fold(__u32 csum) { csum = (csum & 0xffff) + (csum >> 16); csum = (csum & 0xffff) + (csum >> 16); return ~csum; } static __always_inline __u16 csum_replace4(__u32 csum, __u32 from, __u32 to) { __u32 tmp = csum_add(~csum, ~from); return csum_fold(csum_add(tmp, to)); } static __always_inline __u16 csum_replace16(__u32 csum, __u32 *from, __u32 *to) { __u32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], to[0], to[1], to[2], to[3], }; csum = bpf_csum_diff(0, 0, diff, sizeof(diff), ~csum); return csum_fold(csum); } static __always_inline int xdp_flowtable_check_tcp_state(void 
*ports, void *data_end, __u8 proto) { if (proto == IPPROTO_TCP) { struct tcphdr *tcph = ports; if (tcph + 1 > data_end) return -1; if (tcph->fin || tcph->rst) return -1; } return 0; } static __always_inline void xdp_flowtable_update_port_csum(struct flow_ports *ports, void *data_end, __u8 proto, __be16 port, __be16 nat_port) { switch (proto) { case IPPROTO_TCP: { struct tcphdr *tcph = (struct tcphdr *)ports; if (tcph + 1 > data_end) break; tcph->check = csum_replace4((__u32)tcph->check, (__u32)port, (__u32)nat_port); break; } case IPPROTO_UDP: { struct udphdr *udph = (struct udphdr *)ports; if (udph + 1 > data_end) break; if (!udph->check) break; udph->check = csum_replace4((__u32)udph->check, (__u32)port, (__u32)nat_port); if (!udph->check) udph->check = CSUM_MANGLED_0; break; } default: break; } } static __always_inline void xdp_flowtable_snat_port(const struct flow_offload *flow, struct flow_ports *ports, void *data_end, __u8 proto, enum flow_offload_tuple_dir dir) { __be16 port, nat_port; if (ports + 1 > data_end) return; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ports->source; /* For original direction (FLOW_OFFLOAD_DIR_ORIGINAL): * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port contains * the source port used for the traffic transmitted by the * host. * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port contains * the destination port used for the traffic transmitted by * the host. */ bpf_core_read(&nat_port, bpf_core_type_size(nat_port), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); ports->source = nat_port; break; case FLOW_OFFLOAD_DIR_REPLY: /* For reply direction (FLOW_OFFLOAD_DIR_REPLY): * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port * contains source port used for the traffic received by the * host. * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port * contains the destination port used for the traffic * received by the host. 
*/ port = ports->dest; bpf_core_read(&nat_port, bpf_core_type_size(nat_port), &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); ports->dest = nat_port; break; default: return; } xdp_flowtable_update_port_csum(ports, data_end, proto, port, nat_port); } static __always_inline void xdp_flowtable_dnat_port(const struct flow_offload *flow, struct flow_ports *ports, void *data_end, __u8 proto, enum flow_offload_tuple_dir dir) { __be16 port, nat_port; if (ports + 1 > data_end) return; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: /* For original direction (FLOW_OFFLOAD_DIR_ORIGINAL): * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port contains * the source port used for the traffic transmitted by the * host. * - tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port contains * the destination port used for the traffic transmitted by * the host. */ port = ports->dest; bpf_core_read(&nat_port, bpf_core_type_size(nat_port), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); ports->dest = nat_port; break; case FLOW_OFFLOAD_DIR_REPLY: /* For reply direction (FLOW_OFFLOAD_DIR_REPLY): * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port * contains the source port used for the traffic received by * the host. * - tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port * contains destination port used for the traffic received by * the host. 
*/ port = ports->source; bpf_core_read(&nat_port, bpf_core_type_size(nat_port), &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); ports->source = nat_port; break; default: return; } xdp_flowtable_update_port_csum(ports, data_end, proto, port, nat_port); } static __always_inline void xdp_flowtable_update_ipv4_csum(struct iphdr *iph, void *data_end, __be32 addr, __be32 nat_addr) { switch (iph->protocol) { case IPPROTO_TCP: { struct tcphdr *tcph = (struct tcphdr *)(iph + 1); if (tcph + 1 > data_end) break; tcph->check = csum_replace4((__u32)tcph->check, addr, nat_addr); break; } case IPPROTO_UDP: { struct udphdr *udph = (struct udphdr *)(iph + 1); if (udph + 1 > data_end) break; if (!udph->check) break; udph->check = csum_replace4((__u32)udph->check, addr, nat_addr); if (!udph->check) udph->check = CSUM_MANGLED_0; break; } default: break; } } static __always_inline void xdp_flowtable_snat_ip(const struct flow_offload *flow, struct iphdr *iph, void *data_end, enum flow_offload_tuple_dir dir) { __be32 addr, nat_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = iph->saddr; bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr); iph->saddr = nat_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = iph->daddr; bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr); iph->daddr = nat_addr; break; default: return; } iph->check = csum_replace4((__u32)iph->check, addr, nat_addr); xdp_flowtable_update_ipv4_csum(iph, data_end, addr, nat_addr); } static __always_inline void xdp_flowtable_get_dnat_ip(__be32 *addr, const struct flow_offload *flow, enum flow_offload_tuple_dir dir) { switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: bpf_core_read(addr, sizeof(*addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr); break; case FLOW_OFFLOAD_DIR_REPLY: bpf_core_read(addr, sizeof(*addr), 
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr); break; default: break; } } static __always_inline void xdp_flowtable_dnat_ip(const struct flow_offload *flow, struct iphdr *iph, void *data_end, enum flow_offload_tuple_dir dir) { __be32 addr, nat_addr; xdp_flowtable_get_dnat_ip(&nat_addr, flow, dir); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = iph->daddr; iph->daddr = nat_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = iph->saddr; iph->saddr = nat_addr; break; default: return; } iph->check = csum_replace4((__u32)iph->check, addr, nat_addr); xdp_flowtable_update_ipv4_csum(iph, data_end, addr, nat_addr); } static __always_inline void xdp_flowtable_update_ipv6_csum(struct ipv6hdr *ip6h, void *data_end, struct in6_addr *addr, struct in6_addr *nat_addr) { switch (ip6h->nexthdr) { case IPPROTO_TCP: { struct tcphdr *tcph = (struct tcphdr *)(ip6h + 1); if (tcph + 1 > data_end) break; tcph->check = csum_replace16((__u32)tcph->check, addr->in6_u.u6_addr32, nat_addr->in6_u.u6_addr32); break; } case IPPROTO_UDP: { struct udphdr *udph = (struct udphdr *)(ip6h + 1); if (udph + 1 > data_end) break; if (!udph->check) break; udph->check = csum_replace16((__u32)udph->check, addr->in6_u.u6_addr32, nat_addr->in6_u.u6_addr32); if (!udph->check) udph->check = CSUM_MANGLED_0; break; } default: break; } } static __always_inline void xdp_flowtable_snat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h, void *data_end, enum flow_offload_tuple_dir dir) { struct in6_addr addr, nat_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = ip6h->saddr; bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6); ip6h->saddr = nat_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = ip6h->daddr; bpf_core_read(&nat_addr, bpf_core_type_size(nat_addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6); ip6h->daddr = nat_addr; break; default: return; } xdp_flowtable_update_ipv6_csum(ip6h, data_end, &addr, 
&nat_addr); } static __always_inline void xdp_flowtable_get_dnat_ipv6(struct in6_addr *addr, const struct flow_offload *flow, enum flow_offload_tuple_dir dir) { switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: bpf_core_read(addr, sizeof(*addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6); break; case FLOW_OFFLOAD_DIR_REPLY: bpf_core_read(addr, sizeof(*addr), &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6); break; default: break; } } static __always_inline void xdp_flowtable_dnat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h, void *data_end, enum flow_offload_tuple_dir dir) { struct in6_addr addr, nat_addr; xdp_flowtable_get_dnat_ipv6(&nat_addr, flow, dir); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = ip6h->daddr; ip6h->daddr = nat_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = ip6h->saddr; ip6h->saddr = nat_addr; break; default: return; } xdp_flowtable_update_ipv6_csum(ip6h, data_end, &addr, &nat_addr); } static __always_inline void xdp_flowtable_forward_ip(const struct flow_offload *flow, void *data, void *data_end, struct flow_ports *ports, enum flow_offload_tuple_dir dir, unsigned long flags) { struct iphdr *iph = data + sizeof(struct ethhdr); if (iph + 1 > data_end) return; if (flags & BIT(NF_FLOW_SNAT)) { xdp_flowtable_snat_port(flow, ports, data_end, iph->protocol, dir); xdp_flowtable_snat_ip(flow, iph, data_end, dir); } if (flags & BIT(NF_FLOW_DNAT)) { xdp_flowtable_dnat_port(flow, ports, data_end, iph->protocol, dir); xdp_flowtable_dnat_ip(flow, iph, data_end, dir); } ip_decrease_ttl(iph); } static __always_inline void xdp_flowtable_forward_ipv6(const struct flow_offload *flow, void *data, void *data_end, struct flow_ports *ports, enum flow_offload_tuple_dir dir, unsigned long flags) { struct ipv6hdr *ip6h = data + sizeof(struct ethhdr); if (ip6h + 1 > data_end) return; if (flags & BIT(NF_FLOW_SNAT)) { xdp_flowtable_snat_port(flow, ports, data_end, ip6h->nexthdr, dir); xdp_flowtable_snat_ipv6(flow, ip6h, data_end, 
dir); } if (flags & BIT(NF_FLOW_DNAT)) { xdp_flowtable_dnat_port(flow, ports, data_end, ip6h->nexthdr, dir); xdp_flowtable_dnat_ipv6(flow, ip6h, data_end, dir); } ip6h->hop_limit--; } static __always_inline int xdp_flowtable_flags(struct xdp_md *ctx, __u32 fib_flags) { void *data_end = (void *)(long)ctx->data_end; struct flow_offload_tuple_rhash *tuplehash; struct bpf_fib_lookup tuple = { .ifindex = ctx->ingress_ifindex, }; void *data = (void *)(long)ctx->data; struct bpf_flowtable_opts opts = {}; enum flow_offload_tuple_dir dir; struct ethhdr *eth = data; struct flow_offload *flow; struct flow_ports *ports; unsigned long flags; if (eth + 1 > data_end) return XDP_PASS; switch (eth->h_proto) { case bpf_htons(ETH_P_IP): { struct iphdr *iph = data + sizeof(*eth); ports = (struct flow_ports *)(iph + 1); if (ports + 1 > data_end) return XDP_PASS; /* ip fragmented traffic */ if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) return XDP_PASS; /* ip options */ if (iph->ihl * 4 != sizeof(*iph)) return XDP_PASS; if (iph->ttl <= 1) return XDP_PASS; if (xdp_flowtable_check_tcp_state(ports, data_end, iph->protocol) < 0) return XDP_PASS; tuple.family = AF_INET; tuple.tos = iph->tos; tuple.l4_protocol = iph->protocol; tuple.tot_len = bpf_ntohs(iph->tot_len); tuple.ipv4_src = iph->saddr; tuple.ipv4_dst = iph->daddr; tuple.sport = ports->source; tuple.dport = ports->dest; break; } case bpf_htons(ETH_P_IPV6): { struct in6_addr *src = (struct in6_addr *)tuple.ipv6_src; struct in6_addr *dst = (struct in6_addr *)tuple.ipv6_dst; struct ipv6hdr *ip6h = data + sizeof(*eth); ports = (struct flow_ports *)(ip6h + 1); if (ports + 1 > data_end) return XDP_PASS; if (ip6h->hop_limit <= 1) return XDP_PASS; if (xdp_flowtable_check_tcp_state(ports, data_end, ip6h->nexthdr) < 0) return XDP_PASS; tuple.family = AF_INET6; tuple.l4_protocol = ip6h->nexthdr; tuple.tot_len = bpf_ntohs(ip6h->payload_len); *src = ip6h->saddr; *dst = ip6h->daddr; tuple.sport = ports->source; tuple.dport = ports->dest; break; 
} default: return XDP_PASS; } tuplehash = bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts)); if (!tuplehash) return XDP_PASS; flow = container_of(tuplehash, struct flow_offload, tuplehash); if (bpf_core_read(&flags, sizeof(flags), &flow->flags)) return XDP_PASS; if (tuplehash->tuple.xmit_type != FLOW_OFFLOAD_XMIT_NEIGH) return XDP_PASS; dir = tuplehash->tuple.dir; if (dir >= FLOW_OFFLOAD_DIR_MAX) return XDP_PASS; /* update the destination address in case of dnatting before * performing the route lookup */ if (tuple.family == AF_INET6) { struct in6_addr *dst_addr = (struct in6_addr *)&tuple.ipv6_dst; xdp_flowtable_get_dnat_ipv6(dst_addr, flow, dir); } else { xdp_flowtable_get_dnat_ip(&tuple.ipv4_dst, flow, dir); } if (bpf_fib_lookup(ctx, &tuple, sizeof(tuple), fib_flags) != BPF_FIB_LKUP_RET_SUCCESS) return XDP_PASS; /* Verify egress index has been configured as TX-port */ if (!bpf_map_lookup_elem(&xdp_tx_ports, &tuple.ifindex)) return XDP_PASS; if (tuple.family == AF_INET6) xdp_flowtable_forward_ipv6(flow, data, data_end, ports, dir, flags); else xdp_flowtable_forward_ip(flow, data, data_end, ports, dir, flags); __builtin_memcpy(eth->h_dest, tuple.dmac, ETH_ALEN); __builtin_memcpy(eth->h_source, tuple.smac, ETH_ALEN); return bpf_redirect_map(&xdp_tx_ports, tuple.ifindex, 0); } SEC("xdp") int xdp_fwd_flow_full(struct xdp_md *ctx) { return xdp_flowtable_flags(ctx, 0); } SEC("xdp") int xdp_fwd_flow_direct(struct xdp_md *ctx) { return xdp_flowtable_flags(ctx, BPF_FIB_LOOKUP_DIRECT); } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-forward/xdp_flowtable_sample.bpf.c000066400000000000000000000015211514310632100230050ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Original xdp_fwd sample Copyright (c) 2017-18 David Ahern */ #include #include #include #include #define AF_INET 2 struct bpf_flowtable_opts { __s32 error; }; struct flow_offload_tuple_rhash * bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *, struct 
bpf_flowtable_opts *, __u32) __ksym; SEC("xdp") int xdp_fwd_flowtable_sample(struct xdp_md *ctx) { struct flow_offload_tuple_rhash *tuplehash; struct bpf_flowtable_opts opts = {}; struct bpf_fib_lookup tuple = { .family = AF_INET, .ifindex = ctx->ingress_ifindex, }; tuplehash = bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts)); if (!tuplehash) return XDP_DROP; return XDP_PASS; } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-forward/xdp_forward.bpf.c000066400000000000000000000101151514310632100211300ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Original xdp_fwd sample Copyright (c) 2017-18 David Ahern */ #include #include #include #include #define AF_INET 2 #define AF_INET6 10 #define IPV6_FLOWINFO_MASK bpf_htons(0x0FFFFFFF) struct { __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); __uint(key_size, sizeof(int)); __uint(value_size, sizeof(int)); __uint(max_entries, 64); } xdp_tx_ports SEC(".maps"); /* from include/net/ip.h */ static __always_inline int ip_decrease_ttl(struct iphdr *iph) { __u32 check = (__u32)iph->check; check += (__u32)bpf_htons(0x0100); iph->check = (__sum16)(check + (check >= 0xFFFF)); return --iph->ttl; } static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, __u32 flags) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct bpf_fib_lookup fib_params; struct ethhdr *eth = data; struct ipv6hdr *ip6h; struct iphdr *iph; __u16 h_proto; __u64 nh_off; int rc; nh_off = sizeof(*eth); if (data + nh_off > data_end) return XDP_DROP; __builtin_memset(&fib_params, 0, sizeof(fib_params)); h_proto = eth->h_proto; if (h_proto == bpf_htons(ETH_P_IP)) { iph = data + nh_off; if (iph + 1 > data_end) return XDP_DROP; if (iph->ttl <= 1) return XDP_PASS; fib_params.family = AF_INET; fib_params.tos = iph->tos; fib_params.l4_protocol = iph->protocol; fib_params.sport = 0; fib_params.dport = 0; fib_params.tot_len = bpf_ntohs(iph->tot_len); fib_params.ipv4_src = iph->saddr; fib_params.ipv4_dst = 
iph->daddr; } else if (h_proto == bpf_htons(ETH_P_IPV6)) { struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; ip6h = data + nh_off; if (ip6h + 1 > data_end) return XDP_DROP; if (ip6h->hop_limit <= 1) return XDP_PASS; fib_params.family = AF_INET6; fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; fib_params.l4_protocol = ip6h->nexthdr; fib_params.sport = 0; fib_params.dport = 0; fib_params.tot_len = bpf_ntohs(ip6h->payload_len); *src = ip6h->saddr; *dst = ip6h->daddr; } else { return XDP_PASS; } fib_params.ifindex = ctx->ingress_ifindex; rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); /* * Some rc (return codes) from bpf_fib_lookup() are important, * to understand how this XDP-prog interacts with network stack. * * BPF_FIB_LKUP_RET_NO_NEIGH: * Even if route lookup was a success, then the MAC-addresses are also * needed. This is obtained from arp/neighbour table, but if table is * (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid * doing ARP lookup directly from XDP, then send packet to normal * network stack via XDP_PASS and expect it will do ARP resolution. * * BPF_FIB_LKUP_RET_FWD_DISABLED: * The bpf_fib_lookup respect sysctl net.ipv{4,6}.conf.all.forwarding * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if not * enabled this on ingress device. */ if (rc == BPF_FIB_LKUP_RET_SUCCESS) { /* Verify egress index has been configured as TX-port. * (Note: User can still have inserted an egress ifindex that * doesn't support XDP xmit, which will result in packet drops). * * Note: lookup in devmap supported since 0cdbb4b09a0. 
* If not supported will fail with: * cannot pass map_type 14 into func bpf_map_lookup_elem#1: */ if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex)) return XDP_PASS; if (h_proto == bpf_htons(ETH_P_IP)) ip_decrease_ttl(iph); else if (h_proto == bpf_htons(ETH_P_IPV6)) ip6h->hop_limit--; __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN); return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0); } return XDP_PASS; } SEC("xdp") int xdp_fwd_fib_full(struct xdp_md *ctx) { return xdp_fwd_flags(ctx, 0); } SEC("xdp") int xdp_fwd_fib_direct(struct xdp_md *ctx) { return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); } char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-loader/000077500000000000000000000000001514310632100155035ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-loader/.gitignore000066400000000000000000000000131514310632100174650ustar00rootroot00000000000000xdp-loader xdp-tools-1.6.1/xdp-loader/Makefile000066400000000000000000000003321514310632100171410ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) TOOL_NAME := xdp-loader USER_TARGETS := xdp-loader TEST_FILE := tests/test-xdp-loader.sh MAN_PAGE := xdp-loader.8 LIB_DIR = ../lib include $(LIB_DIR)/common.mk xdp-tools-1.6.1/xdp-loader/README.org000066400000000000000000000226101514310632100171520ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-loader #+TITLE: xdp-loader #+OPTIONS: ^:nil #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"XDP program loader" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. 
* xdp-loader - an XDP program loader XDP-loader is a simple loader for XDP programs with support for attaching multiple programs to the same interface. To achieve this it exposes the same load and unload semantics exposed by the libxdp library. See the =libxdp(3)= man page for details of how this works, and what kernel features it relies on. ** Running xdp-loader The syntax for running xdp-loader is: #+begin_src sh xdp-loader COMMAND [options] Where COMMAND can be one of: load - load an XDP program on an interface unload - unload an XDP program from an interface status - show current XDP program status features - show XDP features supported by the NIC clean - clean up detached program links in XDP bpffs directory help - show the list of available commands #+end_src Each command, and its options are explained below. Or use =xdp-loader COMMAND --help= to see the options for each command. * The LOAD command The =load= command loads one or more XDP programs onto an interface. The syntax for the =load= command is: =xdp-loader load [options] = Where == is the name of the interface to load the programs onto, and the == is one or more file names containing XDP programs. The programs will be loaded onto the interface in the order of their preference, as specified by the program metadata (see *libxdp(3)*). The supported options are: ** -m, --mode Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called /skb mode/ (also known as /generic XDP/) to be used, 'hw' which causes the program to be offloaded to the hardware, or 'unspecified' which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). Note that using 'unspecified' can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is 'native'. 
Note that hardware with support for the 'hw' mode is rare: Netronome/Corigine cards (using the 'nfp' driver) are the only devices with support for this in the mainline Linux kernel. ** -p, --pin-path This specifies a root path under which to pin any maps that define the 'pinning' attribute in their definitions. This path must be located on a =bpffs= file system. If not set, maps will not be pinned, even if they specify pinning in their definitions. When pinning maps, if the pinned location for a map already exist, the map pinned there will be reused if it is compatible with the type of the map being loaded. ** -s, --section
Specify which ELF section to load the XDP program(s) from in each file. The default is to use the first program in each file. If this option is set, it applies to all programs being loaded. ** -n, --prog-name Specify which BPF program with the name to load the XDP program(s) from in each file. The default is to use the first program in each file. Only one of --section and --prog-name may be specified. If this option is set, it applies to all programs being loaded. ** -P, --prio Specify the priority to load the XDP program(s) with (this affects the order of programs running on the interface). The default is to use the value from the metadata in the program ELF file, or a value of 50 if the program has no such metadata. If this option is set, it applies to all programs being loaded. ** -A, --actions Specify the "chain call actions" of the loaded XDP program(s). These are the XDP actions that will cause the next program loaded on the interface to be called, instead of returning immediately. The default is to use the value set in the metadata in the program ELF file, or XDP_PASS if no such metadata is set. If this option is set, it applies to all programs being loaded. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The UNLOAD command The =unload= command is used for unloading programs from an interface. The syntax for the =unload= command is: =xdp-loader unload [options] = Where == is the name of the interface to load the programs onto. Either the =--all= or =--id= options must be used to specify which program(s) to unload. The supported options are: ** -i, --id Unload a single program from the interface by ID. Use =xdp-loader status= to obtain the ID of the program being unloaded. If this program is the last program loaded on the interface, the dispatcher program will also be removed, which makes the operation equivalent to specifying =--all=. 
** -a, --all Unload all XDP programs on the interface, as well as the multi-program dispatcher. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The STATUS command The =status= command displays a list of interfaces in the system, and the XDP program(s) loaded on each interface. For each interface, a list of programs are shown, with the run priority and "chain actions" for each program. See the section on program metadata for the meaning of this metadata. ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * The FEATURES command The =features= command displays the XDP features supported by the NIC. Currently supported XDP features are: ** NETDEV_XDP_ACT_BASIC The networking device has basic support for running XDP programs, and can handle the base set of return codes (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX). ** NETDEV_XDP_ACT_REDIRECT The network device supports handling the XDP_REDIRECT return code. This means packets can be redirected from this device by XDP. ** NETDEV_XDP_ACT_NDO_XMIT The networking interfaces implements the ndo_xdp_xmit callback. This means packets can be redirected to this device by XDP. ** NETDEV_XDP_ACT_XSK_ZEROCOPY The networking device supports AF_XDP in zero copy mode. ** NETDEV_XDP_ACT_HW_OFFLOAD The networking device supports XDP hw offloading. ** NETDEV_XDP_ACT_RX_SG The networking device supports non-linear XDP frames on the receive side. This means XDP can be used with big MTUs on this device (if the XDP program is compiled with fragments support) ** NETDEV_XDP_ACT_NDO_XMIT_SG The networking device supports non-linear XDP frames on the transmit side. This means non-linear frames can be redirected to this device. 
* The CLEAN command The syntax for the =clean= command is: =xdp-loader clean [options] [ifname]= The =clean= command cleans up any detached program links in the XDP bpffs directory. When a network interface disappears, any programs loaded in software mode (e.g. skb, native) remain pinned in the bpffs directory, but become detached from the interface. These need to be unlinked from the filesystem. The =clean= command takes an optional interface parameter to only unlink detached programs corresponding to the interface. By default, all detached programs for all interfaces are unlinked. The supported options are: ** -v, --verbose Enable debug logging. Specify twice for even more verbosity. ** -h, --help Display a summary of the available options * Examples To load an XDP program on the eth0 interface simply do: #+begin_src sh # xdp-loader load eth0 xdp_drop.o # xdp-loader status CURRENT XDP PROGRAM STATUS: Interface Prio Program name Mode ID Tag Chain actions ------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher native 50 d51e469e988d81da => 50 xdp_drop 55 57cd311f2e27366b XDP_PASS #+end_src Which shows that a dispatcher program was loaded on the interface, and the xdp_drop program was installed as the first (and only) component program after it. In this instance, the program does not specify any of the metadata above, so the defaults (priority 50 and XDP_PASS as its chain call action) was used. To use the automatic map pinning, include the =pinning= attribute into the map definition in the program, something like: #+begin_src c struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 10); __type(key, __u32); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } my_map SEC(".maps"); #+end_src And load it with the =--pin-path= attribute: #+begin_src sh # xdp-loader load eth0 my_prog.o --pin-path /sys/fs/bpf/my-prog #+end_src This will pin the map at =/sys/fs/bpf/my-prog/my_map=. 
If this already exists, the pinned map will be reused instead of creating a new one, which allows different BPF programs to share the map. * SEE ALSO =libxdp(3)= for details on the XDP loading semantics and kernel compatibility requirements. * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR xdp-loader and this man page were written by Toke Høiland-Jørgensen. xdp-tools-1.6.1/xdp-loader/tests/000077500000000000000000000000001514310632100166455ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-loader/tests/test-xdp-loader.sh000066400000000000000000000070151514310632100222200ustar00rootroot00000000000000XDP_LOADER=${XDP_LOADER:-./xdp-loader} ALL_TESTS="test_load test_section test_prog_name test_load_adjust_tail test_load_multi test_load_incremental test_load_clobber test_features" test_load() { check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o -vv check_run $XDP_LOADER unload $NS --all -vv } test_section() { check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o -s xdp -vv check_run $XDP_LOADER unload $NS --all -vv } test_prog_name() { check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o -n xdp_drop -vv check_run $XDP_LOADER unload $NS --all -vv } test_load_adjust_tail() { check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_adjust_tail.o -vv # Need to load twice to test freplace of both the top-level dispatcher # function as well as sub-functions for multi-prog; but only do this if we # the kernel actually supports loading multiple programs if is_multiprog_supported; then check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_adjust_tail.o -vv fi check_run $XDP_LOADER unload $NS --all -vv } check_progs_loaded() { local iface="$1" local num=$2 local num_loaded num_loaded=$($XDP_LOADER status $NS | grep -c '=>') if [ "$num_loaded" -ne "$num" ]; then echo "Expected $num programs loaded, found $num_loaded" exit 1 fi } test_load_multi() { skip_if_legacy_fallback check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o 
$TEST_PROG_DIR/xdp_pass.o -vv check_progs_loaded $NS 2 check_run $XDP_LOADER unload $NS --all -vv } test_load_incremental() { skip_if_legacy_fallback local output local ret local id check_run $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o -vv check_progs_loaded $NS 1 output=$($XDP_LOADER load $NS $TEST_PROG_DIR/xdp_pass.o -vv 2>&1) ret=$? if [ "$ret" -ne "0" ] && echo $output | grep -q "Falling back to loading single prog"; then ret=$SKIPPED_TEST check_run $XDP_LOADER unload $NS --all -vv else check_progs_loaded $NS 2 id=$($XDP_LOADER status $NS | grep xdp_pass | awk '{print $4}') check_run $XDP_LOADER unload $NS --id $id check_progs_loaded $NS 1 id=$($XDP_LOADER status $NS | grep xdp_drop | awk '{print $4}') check_run $XDP_LOADER unload $NS --id $id check_progs_loaded $NS 0 fi return $ret } test_load_clobber() { skip_if_legacy_fallback check_run env LIBXDP_SKIP_DISPATCHER=1 $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_drop.o -vv check_progs_loaded $NS 0 # legacy prog so should show up as 0 $XDP_LOADER load $NS $TEST_PROG_DIR/xdp_pass.o -vv ret=$? if [ "$ret" -eq "0" ]; then echo "Should not have been able to load prog with legacy prog loaded" return 1 fi check_progs_loaded $NS 0 check_run $XDP_LOADER unload $NS --all -vv } check_xdp_feature() { check_run ip link add dev v0 type veth peer name v1 $XDP_LOADER features v0 | grep "$1" | grep -q "$2" ret=$? 
ip link del dev v0 [ $ret -eq 1 ] && exit 1 } test_features() { skip_if_missing_kernel_symbol xdp_set_features_flag check_xdp_feature NETDEV_XDP_ACT_BASIC yes check_xdp_feature NETDEV_XDP_ACT_REDIRECT yes check_xdp_feature NETDEV_XDP_ACT_NDO_XMIT no check_xdp_feature NETDEV_XDP_ACT_XSK_ZEROCOPY no check_xdp_feature NETDEV_XDP_ACT_HW_OFFLOAD no check_xdp_feature NETDEV_XDP_ACT_RX_SG yes check_xdp_feature NETDEV_XDP_ACT_NDO_XMIT_SG no return 0 } cleanup_tests() { $XDP_LOADER unload $NS --all >/dev/null 2>&1 } xdp-tools-1.6.1/xdp-loader/xdp-loader.8000066400000000000000000000227501514310632100176410ustar00rootroot00000000000000.TH "xdp-loader" "8" "SEPTEMBER 12, 2024" "V1.6.1" "XDP program loader" .SH "NAME" xdp-loader \- an XDP program loader .SH "SYNOPSIS" .PP XDP-loader is a simple loader for XDP programs with support for attaching multiple programs to the same interface. To achieve this it exposes the same load and unload semantics exposed by the libxdp library. See the \fIlibxdp(3)\fP man page for details of how this works, and what kernel features it relies on. .SS "Running xdp-loader" .PP The syntax for running xdp-loader is: .RS .nf \fCxdp-loader COMMAND [options] Where COMMAND can be one of: load - load an XDP program on an interface unload - unload an XDP program from an interface status - show current XDP program status features - show XDP features supported by the NIC clean - clean up detached program links in XDP bpffs directory help - show the list of available commands \fP .fi .RE .PP Each command, and its options are explained below. Or use \fIxdp\-loader COMMAND \-\-help\fP to see the options for each command. .SH "The LOAD command" .PP The \fIload\fP command loads one or more XDP programs onto an interface. .PP The syntax for the \fIload\fP command is: .PP \fIxdp\-loader load [options] \fP .PP Where \fI\fP is the name of the interface to load the programs onto, and the \fI\fP is one or more file names containing XDP programs. 
The programs will be loaded onto the interface in the order of their preference, as specified by the program metadata (see \fBlibxdp(3)\fP). .PP The supported options are: .SS "-m, --mode " .PP Specifies which mode to load the XDP program to be loaded in. The valid values are 'native', which is the default in-driver XDP mode, 'skb', which causes the so-called \fIskb mode\fP (also known as \fIgeneric XDP\fP) to be used, 'hw' which causes the program to be offloaded to the hardware, or 'unspecified' which leaves it up to the kernel to pick a mode (which it will do by picking native mode if the driver supports it, or generic mode otherwise). Note that using 'unspecified' can make it difficult to predict what mode a program will end up being loaded in. For this reason, the default is 'native'. Note that hardware with support for the 'hw' mode is rare: Netronome/Corigine cards (using the 'nfp' driver) are the only devices with support for this in the mainline Linux kernel. .SS "-p, --pin-path " .PP This specifies a root path under which to pin any maps that define the 'pinning' attribute in their definitions. This path must be located on a \fIbpffs\fP file system. If not set, maps will not be pinned, even if they specify pinning in their definitions. When pinning maps, if the pinned location for a map already exist, the map pinned there will be reused if it is compatible with the type of the map being loaded. .SS "-s, --section
" .PP Specify which ELF section to load the XDP program(s) from in each file. The default is to use the first program in each file. If this option is set, it applies to all programs being loaded. .SS "-n, --prog-name " .PP Specify which BPF program with the name to load the XDP program(s) from in each file. The default is to use the first program in each file. Only one of --section and --prog-name may be specified. If this option is set, it applies to all programs being loaded. .SS "-P, --prio " .PP Specify the priority to load the XDP program(s) with (this affects the order of programs running on the interface). The default is to use the value from the metadata in the program ELF file, or a value of 50 if the program has no such metadata. If this option is set, it applies to all programs being loaded. .SS "-A, --actions " .PP Specify the "chain call actions" of the loaded XDP program(s). These are the XDP actions that will cause the next program loaded on the interface to be called, instead of returning immediately. The default is to use the value set in the metadata in the program ELF file, or XDP_PASS if no such metadata is set. If this option is set, it applies to all programs being loaded. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The UNLOAD command" .PP The \fIunload\fP command is used for unloading programs from an interface. .PP The syntax for the \fIunload\fP command is: .PP \fIxdp\-loader unload [options] \fP .PP Where \fI\fP is the name of the interface to load the programs onto. Either the \fI\-\-all\fP or \fI\-\-id\fP options must be used to specify which program(s) to unload. .PP The supported options are: .SS "-i, --id " .PP Unload a single program from the interface by ID. Use \fIxdp\-loader status\fP to obtain the ID of the program being unloaded. 
If this program is the last program loaded on the interface, the dispatcher program will also be removed, which makes the operation equivalent to specifying \fI\-\-all\fP. .SS "-a, --all" .PP Unload all XDP programs on the interface, as well as the multi-program dispatcher. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The STATUS command" .PP The \fIstatus\fP command displays a list of interfaces in the system, and the XDP program(s) loaded on each interface. For each interface, a list of programs are shown, with the run priority and "chain actions" for each program. See the section on program metadata for the meaning of this metadata. .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "The FEATURES command" .PP The \fIfeatures\fP command displays the XDP features supported by the NIC. .PP Currently supported XDP features are: .SS "NETDEV_XDP_ACT_BASIC" .PP The networking device has basic support for running XDP programs, and can handle the base set of return codes (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX). .SS "NETDEV_XDP_ACT_REDIRECT" .PP The network device supports handling the XDP_REDIRECT return code. This means packets can be redirected from this device by XDP. .SS "NETDEV_XDP_ACT_NDO_XMIT" .PP The networking interfaces implements the ndo_xdp_xmit callback. This means packets can be redirected to this device by XDP. .SS "NETDEV_XDP_ACT_XSK_ZEROCOPY" .PP The networking device supports AF_XDP in zero copy mode. .SS "NETDEV_XDP_ACT_HW_OFFLOAD" .PP The networking device supports XDP hw offloading. .SS "NETDEV_XDP_ACT_RX_SG" .PP The networking device supports non-linear XDP frames on the receive side. 
This means XDP can be used with big MTUs on this device (if the XDP program is compiled with fragments support) .SS "NETDEV_XDP_ACT_NDO_XMIT_SG" .PP The networking device supports non-linear XDP frames on the transmit side. This means non-linear frames can be redirected to this device. .SH "The CLEAN command" .PP The syntax for the \fIclean\fP command is: .PP \fIxdp\-loader clean [options] [ifname]\fP .PP The \fIclean\fP command cleans up any detached program links in the XDP bpffs directory. When a network interface disappears, any programs loaded in software mode (e.g. skb, native) remain pinned in the bpffs directory, but become detached from the interface. These need to be unlinked from the filesystem. The \fIclean\fP command takes an optional interface parameter to only unlink detached programs corresponding to the interface. By default, all detached programs for all interfaces are unlinked. .PP The supported options are: .SS "-v, --verbose" .PP Enable debug logging. Specify twice for even more verbosity. .SS "-h, --help" .PP Display a summary of the available options .SH "Examples" .PP To load an XDP program on the eth0 interface simply do: .RS .nf \fC# xdp-loader load eth0 xdp_drop.o # xdp-loader status CURRENT XDP PROGRAM STATUS: Interface Prio Program name Mode ID Tag Chain actions ------------------------------------------------------------------------------------- lo eth0 xdp_dispatcher native 50 d51e469e988d81da => 50 xdp_drop 55 57cd311f2e27366b XDP_PASS \fP .fi .RE .PP Which shows that a dispatcher program was loaded on the interface, and the xdp_drop program was installed as the first (and only) component program after it. In this instance, the program does not specify any of the metadata above, so the defaults (priority 50 and XDP_PASS as its chain call action) was used. 
.PP To use the automatic map pinning, include the \fIpinning\fP attribute into the map definition in the program, something like: .RS .nf \fCstruct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 10); __type(key, __u32); __type(value, __u64); __uint(pinning, LIBBPF_PIN_BY_NAME); } my_map SEC(".maps"); \fP .fi .RE .PP And load it with the \fI\-\-pin\-path\fP attribute: .RS .nf \fC# xdp-loader load eth0 my_prog.o --pin-path /sys/fs/bpf/my-prog \fP .fi .RE .PP This will pin the map at \fI/sys/fs/bpf/my\-prog/my_map\fP. If this already exists, the pinned map will be reused instead of creating a new one, which allows different BPF programs to share the map. .SH "SEE ALSO" .PP \fIlibxdp(3)\fP for details on the XDP loading semantics and kernel compatibility requirements. .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP xdp-loader and this man page were written by Toke Høiland-Jørgensen. xdp-tools-1.6.1/xdp-loader/xdp-loader.c000066400000000000000000000324521514310632100177140ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #include #include #include #include #include #include #include #include #include #include #include #include #include "params.h" #include "logging.h" #include "util.h" #define PROG_NAME "xdp-loader" static const struct loadopt { bool help; struct iface iface; struct multistring filenames; char *pin_path; char *section_name; char *prog_name; enum xdp_attach_mode mode; __u32 prio; __u32 actions; } defaults_load = { .mode = XDP_MODE_NATIVE }; struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {"unspecified", XDP_MODE_UNSPEC}, {NULL, 0} }; struct flag_val load_actions[] = { {"XDP_ABORTED", 1U << XDP_ABORTED}, {"XDP_DROP", 1U << XDP_DROP}, {"XDP_PASS", 1U << XDP_PASS}, {"XDP_TX", 1U << XDP_TX}, {"XDP_REDIRECT", 1U << XDP_REDIRECT}, {} }; #define XDP_FEATURE(FLAG) {#FLAG, FLAG} static const struct flag_val 
xdp_feature_flags[] = { /* NETDEV_XDP features are defined in kernel header */ XDP_FEATURE(NETDEV_XDP_ACT_BASIC), XDP_FEATURE(NETDEV_XDP_ACT_REDIRECT), XDP_FEATURE(NETDEV_XDP_ACT_NDO_XMIT), XDP_FEATURE(NETDEV_XDP_ACT_XSK_ZEROCOPY), XDP_FEATURE(NETDEV_XDP_ACT_HW_OFFLOAD), XDP_FEATURE(NETDEV_XDP_ACT_RX_SG), XDP_FEATURE(NETDEV_XDP_ACT_NDO_XMIT_SG), {NULL, 0} }; #undef XDP_FEATURE static struct prog_option load_options[] = { DEFINE_OPTION("mode", OPT_ENUM, struct loadopt, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("pin-path", OPT_STRING, struct loadopt, pin_path, .short_opt = 'p', .help = "Path to pin maps under (must be in bpffs)."), DEFINE_OPTION("section", OPT_STRING, struct loadopt, section_name, .metavar = "
", .short_opt = 's', .help = "ELF section name of program to load (default: first in file)."), DEFINE_OPTION("prog-name", OPT_STRING, struct loadopt, prog_name, .metavar = "", .short_opt = 'n', .help = "BPF program name of program to load (default: first in file)."), DEFINE_OPTION("dev", OPT_IFNAME, struct loadopt, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), DEFINE_OPTION("filenames", OPT_MULTISTRING, struct loadopt, filenames, .positional = true, .metavar = "", .required = true, .help = "Load programs from "), DEFINE_OPTION("prio", OPT_U32, struct loadopt, prio, .short_opt = 'P', .help = "Set run priority of program"), DEFINE_OPTION("actions", OPT_FLAGS, struct loadopt, actions, .short_opt = 'A', .typearg = load_actions, .metavar = "", .help = "Chain call actions (default: XDP_PASS). e.g. XDP_PASS,XDP_DROP"), END_OPTIONS }; int do_load(const void *cfg, __unused const char *pin_root_path) { const struct loadopt *opt = cfg; struct xdp_program **progs, *p; char errmsg[STRERR_BUFSIZE]; int err = EXIT_SUCCESS; size_t num_progs, i; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = opt->pin_path); if (opt->section_name && opt->prog_name) { pr_warn("Only one of --section or --prog-name can be set\n"); return EXIT_FAILURE; } num_progs = opt->filenames.num_strings; if (!num_progs) { pr_warn("Need at least one filename to load\n"); return EXIT_FAILURE; } else if (num_progs > 1 && opt->mode == XDP_MODE_HW) { pr_warn("Cannot attach multiple programs in HW mode\n"); return EXIT_FAILURE; } progs = calloc(num_progs, sizeof(*progs)); if (!progs) { pr_warn("Couldn't allocate memory\n"); return EXIT_FAILURE; } pr_debug("Loading %zu files on interface '%s'.\n", num_progs, opt->iface.ifname); /* libbpf spits out a lot of unhelpful error messages while loading. * Silence the logging so we can provide our own messages instead; this * is a noop if verbose logging is enabled. 
*/ silence_libbpf_logging(); retry: for (i = 0; i < num_progs; i++) { DECLARE_LIBXDP_OPTS(xdp_program_opts, xdp_opts, 0); struct bpf_program *bpf_prog = NULL; p = progs[i]; if (p) xdp_program__close(p); if (opt->prog_name) { xdp_opts.open_filename = opt->filenames.strings[i]; xdp_opts.prog_name = opt->prog_name; xdp_opts.opts = &opts; p = xdp_program__create(&xdp_opts); } else { p = xdp_program__open_file(opt->filenames.strings[i], opt->section_name, &opts); } err = libxdp_get_error(p); if (err) { if (err == -EPERM && !double_rlimit()) goto retry; libxdp_strerror(err, errmsg, sizeof(errmsg)); pr_warn("Couldn't open file '%s': %s\n", opt->filenames.strings[i], errmsg); goto out; } /* Disable autoload for all programs in the bpf object; libxdp * will make sure to turn it back on for the program that we're * actually loading */ bpf_object__for_each_program(bpf_prog, xdp_program__bpf_obj(p)) bpf_program__set_autoload(bpf_prog, false); if (opt->prio) { err = xdp_program__set_run_prio(p, opt->prio); if (err) { pr_warn("Error setting run priority: %u\n", opt->prio); goto out; } } if (opt->actions) { __u32 a; for (a = XDP_ABORTED; a <= XDP_REDIRECT; a++) { err = xdp_program__set_chain_call_enabled(p, a, opt->actions & (1U << a)); if (err) { pr_warn("Error setting chain call action: %u\n", a); goto out; } } } xdp_program__print_chain_call_actions(p, errmsg, sizeof(errmsg)); pr_debug("XDP program %zu: Run prio: %d. 
Chain call actions: %s\n", i, xdp_program__run_prio(p), errmsg); if (!opt->pin_path) { struct bpf_map *map; bpf_object__for_each_map(map, xdp_program__bpf_obj(p)) { err = bpf_map__set_pin_path(map, NULL); if (err) { pr_warn("Error clearing map pin path: %s\n", strerror(-err)); goto out; } } } progs[i] = p; } err = xdp_program__attach_multi(progs, num_progs, opt->iface.ifindex, opt->mode, 0); if (err) { if (err == -EPERM && !double_rlimit()) goto retry; if (err == -EOPNOTSUPP && (opt->mode == XDP_MODE_NATIVE || opt->mode == XDP_MODE_HW)) { pr_warn("Attaching XDP program in %s mode not supported - try %s mode.\n", opt->mode == XDP_MODE_NATIVE ? "native" : "HW", opt->mode == XDP_MODE_NATIVE ? "SKB" : "native or SKB"); } else { libbpf_strerror(err, errmsg, sizeof(errmsg)); pr_warn("Couldn't attach XDP program on iface '%s': %s(%d)\n", opt->iface.ifname, errmsg, err); } goto out; } out: for (i = 0; i < num_progs; i++) if (progs[i]) xdp_program__close(progs[i]); free(progs); return err; } static const struct unloadopt { bool all; __u32 prog_id; struct iface iface; } defaults_unload = {}; static struct prog_option unload_options[] = { DEFINE_OPTION("dev", OPT_IFNAME, struct unloadopt, iface, .positional = true, .metavar = "", .help = "Unload from device "), DEFINE_OPTION("id", OPT_U32, struct unloadopt, prog_id, .metavar = "", .short_opt = 'i', .help = "Unload program with id "), DEFINE_OPTION("all", OPT_BOOL, struct unloadopt, all, .short_opt = 'a', .help = "Unload all programs from interface"), END_OPTIONS }; int do_unload(const void *cfg, __unused const char *pin_root_path) { const struct unloadopt *opt = cfg; struct xdp_multiprog *mp = NULL; enum xdp_attach_mode mode; int err = EXIT_FAILURE; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .pin_root_path = pin_root_path); if (!opt->all && !opt->prog_id) { pr_warn("Need prog ID or --all\n"); goto out; } if (!opt->iface.ifindex) { pr_warn("Must specify ifname\n"); goto out; } /* The feature probing done by libxdp makes 
libbpf output confusing * error messages even on unload. Silence the logging so we can provide * our own messages instead; this is a noop if verbose logging is * enabled. */ silence_libbpf_logging(); mp = xdp_multiprog__get_from_ifindex(opt->iface.ifindex); if (IS_ERR_OR_NULL(mp)) { pr_warn("No XDP program loaded on %s\n", opt->iface.ifname); mp = NULL; goto out; } if (opt->all) { err = xdp_multiprog__detach(mp); if (err) { pr_warn("Unable to detach XDP program: %s\n", strerror(-err)); goto out; } } else { struct xdp_program *prog = NULL; while ((prog = xdp_multiprog__next_prog(prog, mp))) { if (xdp_program__id(prog) == opt->prog_id) { mode = xdp_multiprog__attach_mode(mp); goto found; } } if (xdp_multiprog__is_legacy(mp)) { prog = xdp_multiprog__main_prog(mp); if (xdp_program__id(prog) == opt->prog_id) { mode = xdp_multiprog__attach_mode(mp); goto found; } } prog = xdp_multiprog__hw_prog(mp); if (xdp_program__id(prog) == opt->prog_id) { mode = XDP_MODE_HW; goto found; } pr_warn("Program with ID %u not loaded on %s\n", opt->prog_id, opt->iface.ifname); err = -ENOENT; goto out; found: pr_debug("Detaching XDP program with ID %u from %s\n", xdp_program__id(prog), opt->iface.ifname); err = xdp_program__detach(prog, opt->iface.ifindex, mode, 0); if (err) { pr_warn("Unable to detach XDP program: %s\n", strerror(-err)); goto out; } } out: xdp_multiprog__close(mp); return err ? EXIT_FAILURE : EXIT_SUCCESS; } static const struct statusopt { struct iface iface; } defaults_status = {}; static struct prog_option status_options[] = { DEFINE_OPTION("dev", OPT_IFNAME, struct statusopt, iface, .positional = true, .metavar = "[ifname]", .help = "Show status for device [ifname] (default all interfaces)"), END_OPTIONS }; int do_status(const void *cfg, __unused const char *pin_root_path) { const struct statusopt *opt = cfg; printf("CURRENT XDP PROGRAM STATUS:\n\n"); return iface_print_status(opt->iface.ifindex ? 
&opt->iface : NULL); } static const struct cleanopt { struct iface iface; } defaults_clean = {}; static struct prog_option clean_options[] = { DEFINE_OPTION("dev", OPT_IFNAME, struct cleanopt, iface, .positional = true, .metavar = "[ifname]", .help = "Clean up detached program links for [ifname] (default all interfaces)"), END_OPTIONS }; int do_clean(const void *cfg, __unused const char *pin_root_path) { const struct cleanopt *opt = cfg; printf("Cleaning up detached XDP program links for %s\n", opt->iface.ifindex ? opt->iface.ifname : "all interfaces"); return libxdp_clean_references(opt->iface.ifindex); } static const struct featuresopt { struct iface iface; } defaults_features = {}; static struct prog_option features_options[] = { DEFINE_OPTION("dev", OPT_IFNAME, struct featuresopt, iface, .positional = true, .metavar = "", .required = true, .help = "Show XDP features for device "), END_OPTIONS }; static int iface_print_xdp_features(const struct iface *iface) { const struct flag_val *flag; __u64 checked_flags = 0; __u64 feature_flags; int err; #ifndef HAVE_LIBBPF_BPF_XDP_QUERY pr_warn("Cannot display features, because xdp-tools was compiled against an " "old version of libbpf without support for querying features.\n"); #endif err = iface_get_xdp_feature_flags(iface->ifindex, &feature_flags); if (err) { pr_warn("Couldn't query XDP features (%d).\n", err); return err; } for (flag = &xdp_feature_flags[0]; flag->flagstring; flag++) { printf("%s:%s%s\n", flag->flagstring, (strlen(flag->flagstring) < 23 ? "\t\t" : "\t"), (feature_flags & flag->flagval ? 
"yes" : "no")); checked_flags |= flag->flagval; } if (feature_flags & ~checked_flags) pr_debug("unknown reported xdp features: 0x%lx\n", (unsigned long)(feature_flags & ~checked_flags)); return 0; } int do_features(const void *cfg, __unused const char *pin_root_path) { const struct featuresopt *opt = cfg; return iface_print_xdp_features(&opt->iface); } int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: xdp-loader COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " load - load an XDP program on an interface\n" " unload - unload an XDP program from an interface\n" " status - show current XDP program status\n" " clean - clean up detached program links in XDP bpffs directory\n" " features - show XDP features supported by the NIC\n" " help - show this help message\n" "\n" "Use 'xdp-loader COMMAND --help' to see options for each command\n"); return -1; } static const struct prog_command cmds[] = { DEFINE_COMMAND(load, "Load an XDP program on an interface"), DEFINE_COMMAND(unload, "Unload an XDP program from an interface"), DEFINE_COMMAND(clean, "Clean up detached program links in XDP bpffs directory"), DEFINE_COMMAND(status, "Show XDP program status"), DEFINE_COMMAND(features, "Show NIC XDP features"), { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct loadopt load; struct unloadopt unload; struct statusopt status; struct featuresopt features; }; int main(int argc, char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, false); return do_help(NULL, NULL); } xdp-tools-1.6.1/xdp-monitor/000077500000000000000000000000001514310632100157245ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-monitor/.gitignore000066400000000000000000000000141514310632100177070ustar00rootroot00000000000000xdp-monitor xdp-tools-1.6.1/xdp-monitor/Makefile000066400000000000000000000005371514310632100173710ustar00rootroot00000000000000# 
SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) XDP_TARGETS := xdp_monitor.bpf BPF_SKEL_TARGETS := $(XDP_TARGETS) # Don't install skeleton object files XDP_OBJ_INSTALL := TOOL_NAME := xdp-monitor MAN_PAGE := xdp-monitor.8 TEST_FILE := tests/test-xdp-monitor.sh USER_TARGETS := xdp-monitor LIB_DIR = ../lib include $(LIB_DIR)/common.mk xdp-tools-1.6.1/xdp-monitor/README.org000066400000000000000000000144401514310632100173750ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-monitor #+TITLE: xdp-monitor #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"A simple XDP monitoring tool" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * XDP-monitor - a simple BPF-powered XDP monitoring tool XDP-monitor is a tool that monitors various XDP related statistics and events using BPF tracepoints infrastructure, trying to be as low overhead as possible. Note that by default, statistics for successful XDP redirect events is disabled, as that leads to a per-packet BPF tracing overhead, which while being low overhead, can lead to packet processing degradation. This tool relies on the BPF raw tracepoints infrastructure in the kernel. There is more information on the meaning of the output in both default (terse) and verbose output mode, in the =Output Format Description= section. ** Running xdp-monitor The syntax for running xdp-monitor is: #+begin_src sh xdp-monitor [options] #+end_src The supported options are: ** -i, --interval Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. ** -s, --stats Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. 
** -e, --extended Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. The output mode can be switched by hitting C-\ while the program is running. See also the *Output Format Description* section below. ** -v, --verbose Enable verbose logging. Supply twice to enable verbose logging from the underlying =libxdp= and =libbpf= libraries. ** --version Show the application version and exit. ** -h, --help Display a summary of the available options * Output Format Description By default, redirect success statistics are disabled, use =--stats= to enable. The terse output mode is default, extended output mode can be activated using the =--extended= command line option. SIGQUIT (Ctrl + \\) can be used to switch the mode dynamically at runtime. Terse mode displays at most the following fields: #+begin_src sh rx/s Number of packets received per second redir/s Number of packets successfully redirected per second err,drop/s Aggregated count of errors per second (including dropped packets) xmit/s Number of packets transmitted on the output device per second #+end_src Verbose output mode displays at most the following fields: #+begin_src sh FIELD DESCRIPTION receive Displays the number of packets received and errors encountered Whenever an error or packet drop occurs, details of per CPU error and drop statistics will be expanded inline in terse mode. 
pkt/s - Packets received per second drop/s - Packets dropped per second error/s - Errors encountered per second redirect - Displays the number of packets successfully redirected Errors encountered are expanded under redirect_err field Note that passing -s to enable it has a per packet overhead redir/s - Packets redirected successfully per second redirect_err Displays the number of packets that failed redirection The errno is expanded under this field with per CPU count The recognized errors are: EINVAL: Invalid redirection ENETDOWN: Device being redirected to is down EMSGSIZE: Packet length too large for device EOPNOTSUPP: Operation not supported ENOSPC: No space in ptr_ring of cpumap kthread error/s - Packets that failed redirection per second enqueue to cpu N Displays the number of packets enqueued to bulk queue of CPU N Expands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N Received packets can be associated with the CPU redirect program is enqueuing packets to. pkt/s - Packets enqueued per second from other CPU to CPU N drop/s - Packets dropped when trying to enqueue to CPU N bulk-avg - Average number of packets processed for each event kthread Displays the number of packets processed in CPUMAP kthread for each CPU Packets consumed from ptr_ring in kthread, and its xdp_stats (after calling CPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and then per-CPU to associate it to each CPU's pinned CPUMAP kthread. 
pkt/s - Packets consumed per second from ptr_ring drop/s - Packets dropped per second in kthread sched - Number of times kthread called schedule() xdp_stats (also expands to per-CPU counts) pass/s - XDP_PASS count for CPUMAP program execution drop/s - XDP_DROP count for CPUMAP program execution redir/s - XDP_REDIRECT count for CPUMAP program execution xdp_exception Displays xdp_exception tracepoint events This can occur due to internal driver errors, unrecognized XDP actions and due to explicit user trigger by use of XDP_ABORTED Each action is expanded below this field with its count hit/s - Number of times the tracepoint was hit per second devmap_xmit Displays devmap_xmit tracepoint events This tracepoint is invoked for successful transmissions on output device but these statistics are not available for generic XDP mode, hence they will be omitted from the output when using SKB mode xmit/s - Number of packets that were transmitted per second drop/s - Number of packets that failed transmissions per second drv_err/s - Number of internal driver errors per second bulk-avg - Average number of packets processed for each event #+end_src * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR The original xdp-monitor tool was written by Jesper Dangaard Brouer. It was then rewritten to support more features by Kumar Kartikeya Dwivedi. This man page was written by Kumar Kartikeya Dwivedi. 
xdp-tools-1.6.1/xdp-monitor/tests/000077500000000000000000000000001514310632100170665ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-monitor/tests/test-xdp-monitor.sh000066400000000000000000000004101514310632100226520ustar00rootroot00000000000000XDP_LOADER=${XDP_LOADER:-./xdp-loader} XDP_MONITOR=${XDP_MONITOR:-./xdp-monitor} ALL_TESTS="test_monitor" test_monitor() { export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run $XDP_MONITOR -vv check_run $XDP_MONITOR -s -vv check_run $XDP_MONITOR -e -vv } xdp-tools-1.6.1/xdp-monitor/xdp-monitor.8000066400000000000000000000160051514310632100202770ustar00rootroot00000000000000.TH "xdp-monitor" "8" "DECEMBER 12, 2022" "V1.6.1" "A simple XDP monitoring tool" .SH "NAME" XDP-monitor \- a simple BPF-powered XDP monitoring tool .SH "SYNOPSIS" .PP XDP-monitor is a tool that monitors various XDP related statistics and events using BPF tracepoints infrastructure, trying to be as low overhead as possible. .PP Note that by default, statistics for successful XDP redirect events is disabled, as that leads to a per-packet BPF tracing overhead, which while being low overhead, can lead to packet processing degradation. .PP This tool relies on the BPF raw tracepoints infrastructure in the kernel. .PP There is more information on the meaning of the output in both default (terse) and verbose output mode, in the \fIOutput Format Description\fP section. .SS "Running xdp-monitor" .PP The syntax for running xdp-monitor is: .RS .nf \fCxdp-monitor [options] \fP .fi .RE .PP The supported options are: .SS "-i, --interval " .PP Set the polling interval for collecting all statistics and displaying them to the output. The unit of interval is in seconds. .SS "-s, --stats" .PP Enable statistics for successful redirection. This option comes with a per packet tracing overhead, for recording all successful redirections. .SS "-e, --extended" .PP Start xdp-bench in "extended" output mode. If not set, xdp-bench will start in "terse" mode. 
The output mode can be switched by hitting C-$\ while the program is running. See also the \fBOutput Format Description\fP section below. .SS "-v, --verbose" .PP Enable verbose logging. Supply twice to enable verbose logging from the underlying \fIlibxdp\fP and \fIlibbpf\fP libraries. .SS "--version" .PP Show the application version and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "Output Format Description" .PP By default, redirect success statistics are disabled, use \fI\-\-stats\fP to enable. The terse output mode is default, extended output mode can be activated using the \fI\-\-extended\fP command line option. .PP SIGQUIT (Ctrl + \\) can be used to switch the mode dynamically at runtime. .PP Terse mode displays at most the following fields: .RS .nf \fCrx/s Number of packets received per second redir/s Number of packets successfully redirected per second err,drop/s Aggregated count of errors per second (including dropped packets) xmit/s Number of packets transmitted on the output device per second \fP .fi .RE .PP Verbose output mode displays at most the following fields: .RS .nf \fCFIELD DESCRIPTION receive Displays the number of packets received and errors encountered Whenever an error or packet drop occurs, details of per CPU error and drop statistics will be expanded inline in terse mode. 
pkt/s - Packets received per second drop/s - Packets dropped per second error/s - Errors encountered per second redirect - Displays the number of packets successfully redirected Errors encountered are expanded under redirect_err field Note that passing -s to enable it has a per packet overhead redir/s - Packets redirected successfully per second redirect_err Displays the number of packets that failed redirection The errno is expanded under this field with per CPU count The recognized errors are: EINVAL: Invalid redirection ENETDOWN: Device being redirected to is down EMSGSIZE: Packet length too large for device EOPNOTSUPP: Operation not supported ENOSPC: No space in ptr_ring of cpumap kthread error/s - Packets that failed redirection per second enqueue to cpu N Displays the number of packets enqueued to bulk queue of CPU N Expands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N Received packets can be associated with the CPU redirect program is enqueuing packets to. pkt/s - Packets enqueued per second from other CPU to CPU N drop/s - Packets dropped when trying to enqueue to CPU N bulk-avg - Average number of packets processed for each event kthread Displays the number of packets processed in CPUMAP kthread for each CPU Packets consumed from ptr_ring in kthread, and its xdp_stats (after calling CPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and then per-CPU to associate it to each CPU's pinned CPUMAP kthread. 
pkt/s - Packets consumed per second from ptr_ring drop/s - Packets dropped per second in kthread sched - Number of times kthread called schedule() xdp_stats (also expands to per-CPU counts) pass/s - XDP_PASS count for CPUMAP program execution drop/s - XDP_DROP count for CPUMAP program execution redir/s - XDP_REDIRECT count for CPUMAP program execution xdp_exception Displays xdp_exception tracepoint events This can occur due to internal driver errors, unrecognized XDP actions and due to explicit user trigger by use of XDP_ABORTED Each action is expanded below this field with its count hit/s - Number of times the tracepoint was hit per second devmap_xmit Displays devmap_xmit tracepoint events This tracepoint is invoked for successful transmissions on output device but these statistics are not available for generic XDP mode, hence they will be omitted from the output when using SKB mode xmit/s - Number of packets that were transmitted per second drop/s - Number of packets that failed transmissions per second drv_err/s - Number of internal driver errors per second bulk-avg - Average number of packets processed for each event \fP .fi .RE .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP The original xdp-monitor tool was written by Jesper Dangaard Brouer. It was then rewritten to support more features by Kumar Kartikeya Dwivedi. This man page was written by Kumar Kartikeya Dwivedi. xdp-tools-1.6.1/xdp-monitor/xdp-monitor.c000066400000000000000000000062061514310632100203540ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
*/ static const char *__doc__= "XDP monitor tool, based on tracepoints\n"; static const char *__doc_err_only__= "NOTICE: Only tracking XDP redirect errors\n" " Enable redirect success stats via '-s/--stats'\n" " (which comes with a per packet processing overhead)\n"; #define PROG_NAME "xdp-monitor" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xdp_monitor.skel.h" #include "params.h" #include "util.h" #include "logging.h" static int mask = SAMPLE_REDIRECT_ERR_CNT | SAMPLE_CPUMAP_ENQUEUE_CNT | SAMPLE_CPUMAP_KTHREAD_CNT | SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; DEFINE_SAMPLE_INIT(xdp_monitor); static const struct monitoropt { bool stats; bool extended; __u32 interval; } defaults_monitoropt = { .stats = false, .interval = 2 }; static struct prog_option xdpmonitor_options[] = { DEFINE_OPTION("interval", OPT_U32, struct monitoropt, interval, .short_opt = 'i', .metavar = "", .help = "Polling interval (default 2)"), DEFINE_OPTION("stats", OPT_BOOL, struct monitoropt, stats, .short_opt = 's', .help = "Enable statistics for transmitted packets (not just errors)"), DEFINE_OPTION("extended", OPT_BOOL, struct monitoropt, extended, .short_opt = 'e', .help = "Start running in extended output mode (C^\\ to toggle)"), END_OPTIONS }; int main(int argc, char **argv) { int ret = EXIT_FAIL_OPTION; struct monitoropt cfg = {}; struct xdp_monitor *skel; if (parse_cmdline_args(argc, argv, xdpmonitor_options, &cfg, sizeof(cfg), PROG_NAME, PROG_NAME, __doc__, &defaults_monitoropt) != 0) return ret; /* If all the options are parsed ok, make sure we are root! 
*/ if (check_bpf_environ()) return ret; skel = xdp_monitor__open(); if (!skel) { pr_warn("Failed to xdp_monitor__open: %s\n", strerror(errno)); return EXIT_FAIL_BPF; } ret = sample_init_pre_load(skel, NULL); if (ret < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = xdp_monitor__load(skel); if (ret < 0) { pr_warn("Failed to xdp_monitor__load: %s\n", strerror(errno)); ret = EXIT_FAIL_BPF; goto end_destroy; } if (cfg.stats) mask |= SAMPLE_REDIRECT_CNT; else printf("%s", __doc_err_only__); if (cfg.extended) sample_switch_mode(); ret = sample_init(skel, mask, 0, 0); if (ret < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-ret)); ret = EXIT_FAIL_BPF; goto end_destroy; } ret = sample_run(cfg.interval, NULL, NULL); if (ret < 0) { pr_warn("Failed during sample run: %s\n", strerror(-ret)); ret = EXIT_FAIL; goto end_destroy; } ret = EXIT_OK; end_destroy: xdp_monitor__destroy(skel); sample_teardown(); return ret; } xdp-tools-1.6.1/xdp-monitor/xdp_monitor.bpf.c000066400000000000000000000004051514310632100211770ustar00rootroot00000000000000// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. 
* * XDP monitor tool, based on tracepoints */ #include #include char _license[] SEC("license") = "GPL"; xdp-tools-1.6.1/xdp-trafficgen/000077500000000000000000000000001514310632100163455ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-trafficgen/.gitignore000066400000000000000000000000171514310632100203330ustar00rootroot00000000000000xdp-trafficgen xdp-tools-1.6.1/xdp-trafficgen/Makefile000066400000000000000000000006111514310632100200030ustar00rootroot00000000000000# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) XDP_TARGETS := xdp_trafficgen.bpf BPF_SKEL_TARGETS := $(XDP_TARGETS) # Don't install skeleton object files XDP_OBJ_INSTALL := TOOL_NAME := xdp-trafficgen USER_TARGETS := xdp-trafficgen MAN_PAGE := xdp-trafficgen.8 EXTRA_DEPS := xdp-trafficgen.h TEST_FILE := tests/test-xdp-trafficgen.sh LIB_DIR = ../lib include $(LIB_DIR)/common.mk xdp-tools-1.6.1/xdp-trafficgen/README.org000066400000000000000000000156151514310632100200230ustar00rootroot00000000000000#+EXPORT_FILE_NAME: xdp-trafficgen #+TITLE: xdp-trafficgen #+OPTIONS: ^:nil #+MAN_CLASS_OPTIONS: :section-id "8\" \"DATE\" \"VERSION\" \"An XDP-based traffic generator" # This file serves both as a README on github, and as the source for the man # page; the latter through the org-mode man page export support. # . # To export the man page, simply use the org-mode exporter; (require 'ox-man) if # it's not available. There's also a Makefile rule to export it. * XDP-trafficgen - an XDP-based traffic generator XDP-trafficgen is a packet generator utilising the XDP kernel subsystem to generate packets transmit them through a network interface. Packets are dynamically generated and transmitted in the kernel, allowing for high performance (millions of packets per second per core). XDP-trafficgen supports generating UDP traffic with fixed or dynamic destination ports, and also has basic support for generating dummy TCP traffic on a single flow. 
** Running xdp-traffigen The syntax for running xdp-trafficgen is: #+begin_src sh Usage: xdp-trafficgen COMMAND [options] COMMAND can be one of: udp - run in UDP mode tcp - run in TCP mode probe - probe kernel support #+end_src Each command, and its options are explained below. Or use =xdp-trafficgen COMMAND --help= to see the options for each command. * The UDP command The UDP command generates UDP traffic to a given destination IP and either a fixed destination port, or a range of port numbers. Only IPv6 traffic is supported, and the generated packets will have their IP hop limit set to 1, so they can't be routed. The syntax for the =udp= command is: =xdp-trafficgen udp [options] = Where == is the name of the destination interface that packets will be transmitted on. Note that the network driver of this network interface must support being the target of XDP redirects (it must implement the =ndo_xdp_xmit= driver operation). The supported options are: ** -m, --dst-mac Set the destination MAC address of generated packets. The default is to generate packets with an all-zero destination MAC. ** -M, --src-mac Set the source MAC address of the generated packets. The default is to use the MAC address of the interface packets are transmitted on. ** -a, --dst-addr Destination IP address of generated packets. The default is the link-local =fe80::2= address. ** -A, --src-addr Source IP address of generated packets. The default is the link-local =fe80::1= address. ** -p, --dst-port Destination UDP port of generated packets, or the first port in the range if running with =--dyn-ports= set. Defaults to 1. ** -P, --src-port Source UDP port of generated packets. Defaults to 1. ** -d, --dyn-ports Enable dynamic port mode where the destination port is varied over a range of == starting from the =--dst-port=. ** -n, --num-packets Number of packets to send before exiting. If not supplied, =xdp-trafficgen= will keep sending packets until interrupted. 
** -s, --pkt-size Size of each UDP packet being sent, including the Ethernet header. The minimum size, which is also the default, is 64 bytes. ** -t, --threads Number of simultaneous threads to transmit from. Each thread will be pinned to a separate CPU core if possible. Defaults to 1. ** -I, --interval Output transmission statistics with this interval (in seconds). ** -v, --verbose Enable verbose logging (-vv: more verbose). ** --version Display version information and exit. ** -h, --help Display a summary of the available options * The TCP command The TCP command generates dummy TCP traffic in a single TCP flow. This relies on first installing an ingress XDP program on the interface used to transmit on. Then, a regular TCP socket connection is established from userspace, and once the handshake is completed, the XDP program will take over and start generating traffic on that flow tuple. The ingress XDP program will intercept ACK packets from the receiver, and keep track of the receive window. The traffic generator has no congestion control, and only very basic retransmit tracking: in essence, any duplicate ACKs from the receiver will cause the sender to reset its send sequence number to the last ACKed value and restart from there. The same thing happens if no progress on the window is made within two seconds. This means that the traffic generator can generate a large amount of dummy traffic, but if there's packet loss a lot of this can be retransmissions. The syntax for the =tcp= command is: =xdp-trafficgen tcp [options] -i = Where == is the name of the destination interface that packets will be transmitted on and == is the peer hostname or IP address to connect to (only IPv6 is supported). Note that the network driver of this network interface must support being the target of XDP redirects (it must implement the =ndo_xdp_xmit= driver operation). The supported options are: ** -p, --dst-port Connect to destination . Default 10000. 
** -m, --mode Load ingress XDP program in ; default native (valid values: native,skb,hw) ** -n, --num-packets Number of packets to send before exiting. If not supplied, =xdp-trafficgen= will keep sending packets until interrupted. ** -I, --interval Output transmission statistics with this interval (in seconds). ** -v, --verbose Enable verbose logging (-vv: more verbose). ** --version Display version information and exit. ** -h, --help Display a summary of the available options * The PROBE command The PROBE command probes the kernel to discover whether it supports the features needed to run xdp-trafficgen. This can be used to discover whether the running kernel is compatible with xdp-trafficgen, as well as to discover whether a particular interface supports sending packets generated by xdp-trafficgen. The syntax for the =probe= command is: =xdp-trafficgen probe [options]= After probing, xdp-trafficgen will print the results of the probing of the kernel, and (if an interface name is supplied), whether the interface supports sending packets generated by xdp-trafficgen. The exit code will be 0 if all probes succeed, and 1 otherwise. Note that the probing relies on the kernel XDP feature reporting, which was added after the XDP support itself. This means that with some older kernel versions, the probing may fail even though xdp-trafficgen would in fact be able to transmit packets. The supported options are: ** -i, --ifname Probe the supplied interface name, in addition to doing the general kernel probing. ** -v, --verbose Enable verbose logging (-vv: more verbose). ** --version Display version information and exit. ** -h, --help Display a summary of the available options * BUGS Please report any bugs on Github: https://github.com/xdp-project/xdp-tools/issues * AUTHOR xdp-trafficgen and this man page were written by Toke Høiland-Jørgensen. 
xdp-tools-1.6.1/xdp-trafficgen/tests/000077500000000000000000000000001514310632100175075ustar00rootroot00000000000000xdp-tools-1.6.1/xdp-trafficgen/tests/test-xdp-trafficgen.sh000066400000000000000000000054631514310632100237310ustar00rootroot00000000000000XDP_LOADER=${XDP_LOADER:-./xdp-loader} XDP_TRAFFICGEN=${XDP_TRAFFICGEN:-./xdp-trafficgen} ALL_TESTS="test_udp test_tcp test_no_support test_xsk_udp" PIDS="" skip_if_missing_kernel_support() { $XDP_TRAFFICGEN probe || exit $SKIPPED_TEST } skip_if_missing_kernel_features() { out=$($XDP_TRAFFICGEN probe -i $NS 2>&1) ERR_REGEX1="Interface $NS does not support sending packets via XDP." ERR_REGEX2="Couldn't query XDP features for interface $NS" if [[ $out =~ $ERR_REGEX1 ]] || [[ $out =~ $ERR_REGEX2 ]]; then exit $SKIPPED_TEST fi } test_udp() { skip_if_missing_kernel_support export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run $XDP_TRAFFICGEN udp $NS -n 1 } test_xsk_one() { action=$1 shift export XDP_SAMPLE_IMMEDIATE_EXIT=1 check_run ip link add dev btest0 type veth peer name btest1 check_run $XDP_TRAFFICGEN $action btest0 "$@" -vv ip link del dev btest0 } test_xsk_udp() { local action local res local hugepg action=xsk-udp test_xsk_one $action test_xsk_one $action --no-need-wakeup test_xsk_one $action --shared-umem test_xsk_one $action -M aa:bb:cc:dd:ee:ff test_xsk_one $action -P 0x12345678 test_xsk_one $action -Q test_xsk_one $action -T 1000 test_xsk_one $action -V test_xsk_one $action -W SCHED_FIFO -U 50 test_xsk_one $action -b 32 test_xsk_one $action -c 1 test_xsk_one $action -c copy test_xsk_one $action -d 1 test_xsk_one $action -f 2048 test_xsk_one $action -m aa:bb:cc:dd:ee:ff test_xsk_one $action -p test_xsk_one $action -q 0 test_xsk_one $action -s 1024 hugepg=$(cat /proc/sys/vm/nr_hugepages) if [ "$hugepg" -lt "8" ]; then echo 8 > /proc/sys/vm/nr_hugepages res=$? 
else res=0 fi if [ "$res" = "0" ]; then test_xsk_one $action -u echo $hugepg > /proc/sys/vm/nr_hugepages fi test_xsk_one $action -w BOOTTIME test_xsk_one $action -w MONOTONIC test_xsk_one $action -x -a test_xsk_one $action -y } test_tcp() { skip_if_missing_kernel_support export XDP_SAMPLE_IMMEDIATE_EXIT=1 PID=$(start_background_ns_devnull "socat -6 TCP-LISTEN:10000,reuseaddr,fork -") $XDP_TRAFFICGEN tcp -i $NS $INSIDE_IP6 -n 1 res=$? stop_background $PID return $res } test_no_support() { skip_if_missing_kernel_support skip_if_missing_kernel_features export XDP_SAMPLE_IMMEDIATE_EXIT=1 ip link add dev xdptest0 type veth || return 1 out=$($XDP_TRAFFICGEN udp xdptest0 -n 1 2>&1) err=$? ERR_REGEX="Interface xdptest0 does not support sending packets via XDP." if [ $err -eq 0 ] || ! [[ $out =~ $ERR_REGEX ]]; then echo $out return 1 fi } cleanup_tests() { $XDP_LOADER unload $NS --all >/dev/null 2>&1 $XDP_LOADER clean >/dev/null 2>&1 } xdp-tools-1.6.1/xdp-trafficgen/xdp-trafficgen.8000066400000000000000000000156131514310632100213450ustar00rootroot00000000000000.TH "xdp-trafficgen" "8" "AUGUST 8, 2025" "V1.6.1" "An XDP-based traffic generator" .SH "NAME" XDP-trafficgen \- an XDP-based traffic generator .SH "SYNOPSIS" .PP XDP-trafficgen is a packet generator utilising the XDP kernel subsystem to generate packets transmit them through a network interface. Packets are dynamically generated and transmitted in the kernel, allowing for high performance (millions of packets per second per core). .PP XDP-trafficgen supports generating UDP traffic with fixed or dynamic destination ports, and also has basic support for generating dummy TCP traffic on a single flow. .SS "Running xdp-traffigen" .PP The syntax for running xdp-trafficgen is: .RS .nf \fCUsage: xdp-trafficgen COMMAND [options] COMMAND can be one of: udp - run in UDP mode tcp - run in TCP mode probe - probe kernel support \fP .fi .RE .PP Each command, and its options are explained below. 
Or use \fIxdp\-trafficgen COMMAND \-\-help\fP to see the options for each command. .SH "The UDP command" .PP The UDP command generates UDP traffic to a given destination IP and either a fixed destination port, or a range of port numbers. Only IPv6 traffic is supported, and the generated packets will have their IP hop limit set to 1, so they can't be routed. .PP The syntax for the \fIudp\fP command is: .PP \fIxdp\-trafficgen udp [options] \fP .PP Where \fI\fP is the name of the destination interface that packets will be transmitted on. Note that the network driver of this network interface must support being the target of XDP redirects (it must implement the \fIndo_xdp_xmit\fP driver operation). .PP The supported options are: .SS "-m, --dst-mac " .PP Set the destination MAC address of generated packets. The default is to generate packets with an all-zero destination MAC. .SS "-M, --src-mac " .PP Set the source MAC address of the generated packets. The default is to use the MAC address of the interface packets are transmitted on. .SS "-a, --dst-addr " .PP Destination IP address of generated packets. The default is the link-local \fIfe80::2\fP address. .SS "-A, --src-addr " .PP Source IP address of generated packets. The default is the link-local \fIfe80::1\fP address. .SS "-p, --dst-port " .PP Destination UDP port of generated packets, or the first port in the range if running with \fI\-\-dyn\-ports\fP set. Defaults to 1. .SS "-P, --src-port " .PP Source UDP port of generated packets. Defaults to 1. .SS "-d, --dyn-ports " .PP Enable dynamic port mode where the destination port is varied over a range of \fI\fP starting from the \fI\-\-dst\-port\fP. .SS "-n, --num-packets " .PP Number of packets to send before exiting. If not supplied, \fIxdp\-trafficgen\fP will keep sending packets until interrupted. .SS "-s, --pkt-size " .PP Size of each UDP packet being sent, including the Ethernet header. The minimum size, which is also the default, is 64 bytes. 
.SS "-t, --threads " .PP Number of simultaneous threads to transmit from. Each thread will be pinned to a separate CPU core if possible. Defaults to 1. .SS "-I, --interval " .PP Output transmission statistics with this interval (in seconds). .SS "-v, --verbose" .PP Enable verbose logging (-vv: more verbose). .SS "--version" .PP Display version information and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The TCP command" .PP The TCP command generates dummy TCP traffic in a single TCP flow. This relies on first installing an ingress XDP program on the interface used to transmit on. Then, a regular TCP socket connection is established from userspace, and once the handshake is completed, the XDP program will take over and start generating traffic on that flow tuple. The ingress XDP program will intercept ACK packets from the receiver, and keep track of the receive window. .PP The traffic generator has no congestion control, and only very basic retransmit tracking: in essence, any duplicate ACKs from the receiver will cause the sender to reset its send sequence number to the last ACKed value and restart from there. The same thing happens if no progress on the window is made within two seconds. This means that the traffic generator can generate a large amount of dummy traffic, but if there's packet loss a lot of this can be retransmissions. .PP The syntax for the \fItcp\fP command is: .PP \fIxdp\-trafficgen tcp [options] \-i \fP .PP Where \fI\fP is the name of the destination interface that packets will be transmitted on and \fI\fP is the peer hostname or IP address to connect to (only IPv6 is supported). Note that the network driver of this network interface must support being the target of XDP redirects (it must implement the \fIndo_xdp_xmit\fP driver operation). .PP The supported options are: .SS "-p, --dst-port " .PP Connect to destination . Default 10000. 
.SS "-m, --mode " .PP Load ingress XDP program in ; default native (valid values: native,skb,hw) .SS "-n, --num-packets " .PP Number of packets to send before exiting. If not supplied, \fIxdp\-trafficgen\fP will keep sending packets until interrupted. .SS "-I, --interval " .PP Output transmission statistics with this interval (in seconds). .SS "-v, --verbose" .PP Enable verbose logging (-vv: more verbose). .SS "--version" .PP Display version information and exit. .SS "-h, --help" .PP Display a summary of the available options .SH "The PROBE command" .PP The PROBE command probes the kernel to discover whether it supports the features needed to run xdp-trafficgen. This can be used to discover whether the running kernel is compatible with xdp-trafficgen, as well as to discover whether a particular interface supports sending packets generated by xdp-trafficgen. .PP The syntax for the \fIprobe\fP command is: .PP \fIxdp\-trafficgen probe [options]\fP .PP After probing, xdp-trafficgen will print the results of the probing of the kernel, and (if an interface name is supplied), whether the interface supports sending packets generated by xdp-trafficgen. The exit code will be 0 if all probes succeed, and 1 otherwise. .PP Note that the probing relies on the kernel XDP feature reporting, which was added after the XDP support itself. This means that with some older kernel versions, the probing may fail even though xdp-trafficgen would in fact be able to transmit packets. .PP The supported options are: .SS "-i, --ifname " .PP Probe the supplied interface name, in addition to doing the general kernel probing. .SS "-v, --verbose" .PP Enable verbose logging (-vv: more verbose). .SS "--version" .PP Display version information and exit. 
.SS "-h, --help" .PP Display a summary of the available options .SH "BUGS" .PP Please report any bugs on Github: \fIhttps://github.com/xdp-project/xdp-tools/issues\fP .SH "AUTHOR" .PP xdp-trafficgen and this man page were written by Toke Høiland-Jørgensen. xdp-tools-1.6.1/xdp-trafficgen/xdp-trafficgen.c000066400000000000000000001060261514310632100214170ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "params.h" #include "logging.h" #include "util.h" #include "xdp_sample.h" #include "xdp-trafficgen.h" #include "xdpsock.h" #include "xdp_trafficgen.skel.h" #define PROG_NAME "xdp-trafficgen" #ifndef BPF_F_TEST_XDP_LIVE_FRAMES #define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) #endif #define IFINDEX_LO 1 static int mask = SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_DROP_OK; DEFINE_SAMPLE_INIT(xdp_trafficgen); static bool status_exited = false; struct enum_val xdp_modes[] = { {"native", XDP_MODE_NATIVE}, {"skb", XDP_MODE_SKB}, {"hw", XDP_MODE_HW}, {NULL, 0} }; static const char *driver_pass_list[] = { "bnxt", "ena", "gve", "i40e", "ice", "igb", "igc", "ixgbe", "octeontx2", "stmmac", "mlx5_core", }; static bool driver_needs_xdp_pass(const struct iface *iface) { const char *name = get_driver_name(iface->ifindex); struct xdp_multiprog *mp; __u64 feature_flags; size_t i; int err; /* If the interface already has the NDO_XMIT feature, we don't need to load anything */ err = iface_get_xdp_feature_flags(iface->ifindex, &feature_flags); if (!err && feature_flags & NETDEV_XDP_ACT_NDO_XMIT) return false; mp = xdp_multiprog__get_from_ifindex(iface->ifindex); if (!IS_ERR_OR_NULL(mp)) { pr_debug("Interface %s already has an XDP program loaded\n", iface->ifname); 
xdp_multiprog__close(mp); return false; } for (i = 0; i < ARRAY_SIZE(driver_pass_list); i++) { if (!strcmp(name, driver_pass_list[i])) { pr_debug("Driver %s on interface %s needs an xdp_pass program to use XDP_REDIRECT\n", name, iface->ifname); return true; } } return false; } static int check_iface_support(const struct iface *iface) { __u64 feature_flags = 0; int err; err = iface_get_xdp_feature_flags(iface->ifindex, &feature_flags); if (err || !feature_flags) { /* The libbpf query function, doesn't distinguish between * "querying is not supported" and "no feature flags are set", * so treat a 0-value feature_flags as a failure to query * instead of refuring to run because the NDO_XMIT bit is not * set. */ pr_warn("Couldn't query XDP features for interface %s (%d).\n" "Continuing anyway, but running may fail!\n", iface->ifname, -err); } else if (!(feature_flags & NETDEV_XDP_ACT_NDO_XMIT)) { pr_warn("Interface %s does not support sending packets via XDP.\n", iface->ifname); return -EOPNOTSUPP; } return 0; } struct udp_packet { struct ethhdr eth; struct ipv6hdr iph; struct udphdr udp; __u8 payload[64 - sizeof(struct udphdr) - sizeof(struct ethhdr) - sizeof(struct ipv6hdr)]; } __attribute__((__packed__)); static struct udp_packet pkt_udp = { .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), .iph.version = 6, .iph.nexthdr = IPPROTO_UDP, .iph.payload_len = bpf_htons(sizeof(struct udp_packet) - offsetof(struct udp_packet, udp)), .iph.hop_limit = 1, .iph.saddr.s6_addr16 = {bpf_htons(0xfe80), 0, 0, 0, 0, 0, 0, bpf_htons(1)}, .iph.daddr.s6_addr16 = {bpf_htons(0xfe80), 0, 0, 0, 0, 0, 0, bpf_htons(2)}, .udp.source = bpf_htons(1), .udp.dest = bpf_htons(1), .udp.len = bpf_htons(sizeof(struct udp_packet) - offsetof(struct udp_packet, udp)), }; struct thread_config { void *pkt; size_t pkt_size; __u32 cpu_core_id; __u32 num_pkts; __u32 batch_size; struct xdp_program *prog; }; static int run_prog(const struct thread_config *cfg, bool *status_var) { #ifdef 
HAVE_LIBBPF_BPF_PROG_TEST_RUN_OPTS struct xdp_md ctx_in = { .data_end = cfg->pkt_size, }; DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = cfg->pkt, .data_size_in = cfg->pkt_size, .ctx_in = &ctx_in, .ctx_size_in = sizeof(ctx_in), .repeat = cfg->num_pkts ?: 1 << 20, .flags = BPF_F_TEST_XDP_LIVE_FRAMES, .batch_size = cfg->batch_size, ); __u64 iterations = 0; cpu_set_t cpu_cores; int err; CPU_ZERO(&cpu_cores); CPU_SET(cfg->cpu_core_id, &cpu_cores); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); do { err = xdp_program__test_run(cfg->prog, &opts, 0); if (err) return -errno; iterations += opts.repeat; } while (!*status_var && (!cfg->num_pkts || cfg->num_pkts > iterations)); return 0; #else __unused const void *c = cfg, *s = status_var; return -EOPNOTSUPP; #endif } static void *run_traffic(void *arg) { const struct thread_config *cfg = arg; int err; err = run_prog(cfg, &status_exited); if (err) pr_warn("Couldn't run trafficgen program: %s\n", strerror(-err)); kill(getpid(), SIGINT); return NULL; } static int probe_kernel_support(void) { DECLARE_LIBXDP_OPTS(xdp_program_opts, opts); struct xdp_trafficgen *skel; struct xdp_program *prog; __u8 data[ETH_HLEN] = {}; bool status = 0; int err; skel = xdp_trafficgen__open(); if (!skel) { err = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-err)); return err; } err = sample_init_pre_load(skel, "lo"); if (err < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-err)); goto out; } opts.obj = skel->obj; opts.prog_name = "xdp_drop"; prog = xdp_program__create(&opts); if (!prog) { err = -errno; pr_warn("Couldn't load XDP program: %s\n", strerror(-err)); goto out; } const struct thread_config cfg = { .pkt = data, .pkt_size = sizeof(data), .num_pkts = 1, .batch_size = 1, .prog = prog }; err = run_prog(&cfg, &status); if (err == -EOPNOTSUPP) { pr_warn("BPF_PROG_RUN with batch size support is missing from libbpf.\n"); } else if (err == -EINVAL) { err = -EOPNOTSUPP; pr_warn("Kernel 
doesn't support live packet mode for XDP BPF_PROG_RUN.\n"); } else if (err) { pr_warn("Error probing kernel support: %s\n", strerror(-err)); } xdp_program__close(prog); out: xdp_trafficgen__destroy(skel); return err; } static int create_runners(pthread_t **runner_threads, struct thread_config **thread_configs, int num_threads, struct thread_config *tcfg, struct xdp_program *prog) { struct thread_config *t; pthread_t *threads; int i, err; threads = calloc(num_threads, sizeof(pthread_t)); if (!threads) { pr_warn("Couldn't allocate memory\n"); return -ENOMEM; } t = calloc(num_threads, sizeof(struct thread_config)); if (!t) { pr_warn("Couldn't allocate memory\n"); free(threads); return -ENOMEM; } for (i = 0; i < num_threads; i++) { memcpy(&t[i], tcfg, sizeof(*tcfg)); tcfg->cpu_core_id++; t[i].prog = xdp_program__clone(prog, 0); err = libxdp_get_error(t[i].prog); if (err) { pr_warn("Failed to clone xdp_program: %s\n", strerror(-err)); t[i].prog = NULL; goto err; } err = pthread_create(&threads[i], NULL, run_traffic, &t[i]); if (err < 0) { pr_warn("Failed to create traffic thread: %s\n", strerror(-err)); goto err; } } *runner_threads = threads; *thread_configs = t; return 0; err: for (i = 0; i < num_threads; i++) { pthread_cancel(threads[i]); xdp_program__close(t[i].prog); } free(t); free(threads); return err; } static __be16 calc_udp_cksum(const struct udp_packet *pkt) { __u32 chksum = pkt->iph.nexthdr + bpf_ntohs(pkt->iph.payload_len); int i; for (i = 0; i < 8; i++) { chksum += bpf_ntohs(pkt->iph.saddr.s6_addr16[i]); chksum += bpf_ntohs(pkt->iph.daddr.s6_addr16[i]); } chksum += bpf_ntohs(pkt->udp.source); chksum += bpf_ntohs(pkt->udp.dest); chksum += bpf_ntohs(pkt->udp.len); while (chksum >> 16) chksum = (chksum & 0xFFFF) + (chksum >> 16); return bpf_htons(~chksum); } static const struct udpopt { __u32 num_pkts; struct iface iface; struct mac_addr dst_mac; struct mac_addr src_mac; struct ip_addr dst_ip; struct ip_addr src_ip; __u16 dst_port; __u16 src_port; __u16 
dyn_ports; __u16 threads; __u16 interval; __u16 pkt_size; __u8 hop_limit; } defaults_udp = { .interval = 1, .threads = 1, .pkt_size = 64, .hop_limit = 1, }; static struct udp_packet *prepare_udp_pkt(const struct udpopt *cfg) { struct mac_addr src_mac = cfg->src_mac; struct udp_packet *pkt = NULL; __u16 payload_len; int err; if (macaddr_is_null(&src_mac)) { err = get_mac_addr(cfg->iface.ifindex, &src_mac); if (err) goto err; } if (cfg->pkt_size < sizeof(*pkt)) { pr_warn("Minimum packet size is %zu bytes\n", sizeof(*pkt)); goto err; } pkt = calloc(1, cfg->pkt_size); if (!pkt) goto err; memcpy(pkt, &pkt_udp, sizeof(*pkt)); payload_len = cfg->pkt_size - offsetof(struct udp_packet, udp); pkt->iph.payload_len = bpf_htons(payload_len); pkt->iph.hop_limit = cfg->hop_limit; pkt->udp.len = bpf_htons(payload_len); memcpy(pkt->eth.h_source, &src_mac, sizeof(src_mac)); if (!macaddr_is_null(&cfg->dst_mac)) memcpy(pkt->eth.h_dest, &cfg->dst_mac, sizeof(cfg->dst_mac)); if (!ipaddr_is_null(&cfg->src_ip)) { if (cfg->src_ip.af != AF_INET6) { pr_warn("Only IPv6 is supported\n"); goto err; } pkt->iph.saddr = cfg->src_ip.addr.addr6; } if (!ipaddr_is_null(&cfg->dst_ip)) { if (cfg->dst_ip.af != AF_INET6) { pr_warn("Only IPv6 is supported\n"); goto err; } pkt->iph.daddr = cfg->dst_ip.addr.addr6; } if (cfg->src_port) pkt->udp.source = bpf_htons(cfg->src_port); if (cfg->dst_port) pkt->udp.dest = bpf_htons(cfg->dst_port); pkt->udp.check = calc_udp_cksum(pkt); return pkt; err: free(pkt); return NULL; } static struct prog_option udp_options[] = { DEFINE_OPTION("dst-mac", OPT_MACADDR, struct udpopt, dst_mac, .short_opt = 'm', .metavar = "", .help = "Destination MAC address of generated packets"), DEFINE_OPTION("src-mac", OPT_MACADDR, struct udpopt, src_mac, .short_opt = 'M', .metavar = "", .help = "Source MAC address of generated packets"), DEFINE_OPTION("dst-addr", OPT_IPADDR, struct udpopt, dst_ip, .short_opt = 'a', .metavar = "", .help = "Destination IP address of generated packets"), 
DEFINE_OPTION("src-addr", OPT_IPADDR, struct udpopt, src_ip, .short_opt = 'A', .metavar = "", .help = "Source IP address of generated packets"), DEFINE_OPTION("dst-port", OPT_U16, struct udpopt, dst_port, .short_opt = 'p', .metavar = "", .help = "Destination port of generated packets"), DEFINE_OPTION("src-port", OPT_U16, struct udpopt, src_port, .short_opt = 'P', .metavar = "", .help = "Source port of generated packets"), DEFINE_OPTION("dyn-ports", OPT_U16, struct udpopt, dyn_ports, .short_opt = 'd', .metavar = "", .help = "Dynamically vary destination port over a range of "), DEFINE_OPTION("num-packets", OPT_U32, struct udpopt, num_pkts, .short_opt = 'n', .metavar = "", .help = "Number of packets to send"), DEFINE_OPTION("pkt-size", OPT_U16, struct udpopt, pkt_size, .short_opt = 's', .metavar = "", .help = "Packet size. Default 64."), DEFINE_OPTION("threads", OPT_U16, struct udpopt, threads, .short_opt = 't', .metavar = "", .help = "Number of simultaneous threads to transmit from"), DEFINE_OPTION("interval", OPT_U16, struct udpopt, interval, .short_opt = 'I', .metavar = "", .help = "Output statistics with this interval"), DEFINE_OPTION("hop-limit", OPT_U8, struct udpopt, hop_limit, .short_opt = 'l', .metavar = "", .help = "Hop limit to set in the IP header. 
Default 1."), DEFINE_OPTION("interface", OPT_IFNAME, struct udpopt, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), END_OPTIONS }; int do_udp(const void *opt, __unused const char *pin_root_path) { struct xdp_program *prog = NULL, *pass_prog = NULL; const struct udpopt *cfg = opt; DECLARE_LIBXDP_OPTS(xdp_program_opts, opts); struct thread_config *t = NULL, tcfg = { .pkt_size = cfg->pkt_size, .num_pkts = cfg->num_pkts, }; struct trafficgen_state bpf_state = {}; struct xdp_trafficgen *skel = NULL; struct udp_packet *payload = NULL; pthread_t *runner_threads = NULL; int err = 0, i; char buf[100]; __u32 key = 0; err = probe_kernel_support(); if (err) return err; payload = prepare_udp_pkt(cfg); if (!payload) { err = -ENOMEM; goto out; } tcfg.pkt = payload; skel = xdp_trafficgen__open(); if (!skel) { err = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-err)); goto out; } err = sample_init_pre_load(skel, cfg->iface.ifname); if (err < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-err)); goto out; } skel->rodata->config.port_start = cfg->dst_port; skel->rodata->config.port_range = cfg->dyn_ports; skel->rodata->config.ifindex_out = cfg->iface.ifindex; bpf_state.next_port = cfg->dst_port; if (cfg->dyn_ports) opts.prog_name = "xdp_redirect_update_port"; else opts.prog_name = "xdp_redirect_notouch"; opts.obj = skel->obj; prog = xdp_program__create(&opts); if (!prog) { err = -errno; libxdp_strerror(err, buf, sizeof(buf)); pr_warn("Couldn't open BPF file: %s\n", buf); goto out; } if (driver_needs_xdp_pass(&cfg->iface)) { DECLARE_LIBXDP_OPTS(xdp_program_opts, pass_opts); pass_opts.prog_name = "xdp_pass"; pass_opts.find_filename = "xdp-dispatcher.o"; pass_prog = xdp_program__create(&pass_opts); if (!pass_prog) { err = -errno; pr_warn("Couldn't load xdp_pass program\n"); goto out; } xdp_program__set_xdp_frags_support(pass_prog, true); } err = xdp_trafficgen__load(skel); if (err) goto out; if (pass_prog) { err = 
xdp_program__attach(pass_prog, cfg->iface.ifindex, XDP_MODE_NATIVE, 0); if (err) { pr_warn("Couldn't attach xdp_pass program\n"); xdp_program__close(pass_prog); pass_prog = NULL; goto out; } } err = check_iface_support(&cfg->iface); if (err) goto out; err = bpf_map_update_elem(bpf_map__fd(skel->maps.state_map), &key, &bpf_state, BPF_EXIST); if (err) { err = -errno; pr_warn("Couldn't set initial state map value: %s\n", strerror(-err)); goto out; } err = sample_init(skel, mask, IFINDEX_LO, cfg->iface.ifindex); if (err < 0) { pr_warn("Failed to initialize sample: %s\n", strerror(-err)); goto out; } err = create_runners(&runner_threads, &t, cfg->threads, &tcfg, prog); if (err) goto out; pr_info("Transmitting on %s (ifindex %d)\n", cfg->iface.ifname, cfg->iface.ifindex); err = sample_run(cfg->interval, NULL, NULL); status_exited = true; for (i = 0; i < cfg->threads; i++) { pthread_join(runner_threads[i], NULL); xdp_program__close(t[i].prog); } out: if (pass_prog) { xdp_program__detach(pass_prog, cfg->iface.ifindex, XDP_MODE_NATIVE, 0); xdp_program__close(pass_prog); } xdp_program__close(prog); xdp_trafficgen__destroy(skel); free(runner_threads); free(payload); free(t); return err; } const struct xsk_opts defaults_xsk_udp = { .attach_mode = XDP_MODE_NATIVE, .interval = 1, .retries = 3, .frame_size = 4096, .batch_size = 64, .tx_pkt_size = 64, .sched_policy = XSK_SCHED_OTHER, .clock = XSK_CLOCK_MONOTONIC, .vlan_id = 1, .vlan_pri = 0, }; struct enum_val xsk_program_modes[] = { {"rxdrop", XSK_RXDROP}, {"swap-macs", XSK_SWAP_MACS}, {NULL, 0} }; struct enum_val xsk_copy_modes[] = { {"auto", XSK_COPY_AUTO}, {"copy", XSK_COPY_COPY}, {"zero-copy", XSK_COPY_ZEROCOPY}, {NULL, 0} }; struct enum_val xsk_clocks[] = { {"MONOTONIC", XSK_CLOCK_MONOTONIC}, {"REALTIME", XSK_CLOCK_REALTIME}, {"TAI", XSK_CLOCK_TAI}, {"BOOTTIME", XSK_CLOCK_BOOTTIME}, {NULL, 0} }; struct enum_val xsk_sched_policies[] = { {"SCHED_OTHER", XSK_SCHED_OTHER}, {"SCHED_FIFO", XSK_SCHED_FIFO}, {NULL, 0} }; struct 
prog_option xsk_udp_options[] = { DEFINE_OPTION("dst-mac", OPT_MACADDR, struct xsk_opts, dst_mac, .short_opt = 'm', .metavar = "", .help = "Destination MAC address of generated packets"), DEFINE_OPTION("src-mac", OPT_MACADDR, struct xsk_opts, src_mac, .short_opt = 'M', .metavar = "", .help = "Source MAC address of generated packets"), DEFINE_OPTION("timestamp", OPT_BOOL, struct xsk_opts, timestamp, .short_opt = 'y', .help = "Add timestamp to packets"), DEFINE_OPTION("vlan-tag", OPT_BOOL, struct xsk_opts, vlan_tag, .short_opt = 'V', .help = "Add vlan tag to packets"), DEFINE_OPTION("vlan-id", OPT_U16, struct xsk_opts, vlan_id, .short_opt = 'J', .metavar = "", .help = "VLAN ID to insert into VLAN tag (with -V). Default 1."), DEFINE_OPTION("vlan-pri", OPT_U16, struct xsk_opts, vlan_pri, .short_opt = 'K', .metavar = "", .help = "VLAN PRI to insert into VLAN tag (with -V). Default 0"), DEFINE_OPTION("fill-pattern", OPT_U32, struct xsk_opts, pkt_fill_pattern, .short_opt = 'P', .metavar = "", .hex = true, .help = "Fill pattern (u32 hex value)"), DEFINE_OPTION("tx-cycle-time", OPT_U64, struct xsk_opts, tx_cycle_us, .short_opt = 'T', .metavar = "", .help = "TX cycle time (usec)."), DEFINE_OPTION("queue", OPT_U32, struct xsk_opts, queue_idx, .short_opt = 'q', .metavar = "", .help = "Queue index to use (default 0)"), DEFINE_OPTION("interval", OPT_U32, struct xsk_opts, interval, .short_opt = 'i', .metavar = "", .help = "Statistics update interval (default 1)"), DEFINE_OPTION("retries", OPT_U32, struct xsk_opts, retries, .short_opt = 'O', .metavar = "", .help = "Number of time-out retries per 1s interval (default 3)"), DEFINE_OPTION("frame-size", OPT_U32, struct xsk_opts, frame_size, .short_opt = 'f', .metavar = "", .help = "Data frame size (must be a power of two in aligned mode); default 4096"), DEFINE_OPTION("pkt-size", OPT_U16, struct xsk_opts, tx_pkt_size, .short_opt = 's', .metavar = "", .help = "Packet size of transmitted packets; default 64"), DEFINE_OPTION("duration", 
OPT_U32, struct xsk_opts, duration, .short_opt = 'd', .metavar = "", .help = "Duration to run; default 0 (forever)"), DEFINE_OPTION("pkt-count", OPT_U32, struct xsk_opts, pkt_count, .short_opt = 'c', .metavar = "", .help = "Number of packets to send before exiting; default 0 (forever)"), DEFINE_OPTION("batch-size", OPT_U32, struct xsk_opts, batch_size, .short_opt = 'b', .metavar = "", .help = "Batch size for receive loop; default 64"), DEFINE_OPTION("irq-string", OPT_STRING, struct xsk_opts, irq_string, .short_opt = 'I', .metavar = "", .help = "Display driver interrupt statistics for interface associated with "), DEFINE_OPTION("poll", OPT_BOOL, struct xsk_opts, use_poll, .short_opt = 'p', .help = "Use poll syscall"), DEFINE_OPTION("no-need-wakeup", OPT_BOOL, struct xsk_opts, no_need_wakeup, .help = "Turn off use of driver need wakeup flag"), DEFINE_OPTION("unaligned", OPT_BOOL, struct xsk_opts, unaligned, .short_opt = 'u', .help = "Enable unaligned chunk placement"), DEFINE_OPTION("shared-umem", OPT_BOOL, struct xsk_opts, shared_umem, .help = "Enable XDP_SHARED_UMEM across multiple sockets"), DEFINE_OPTION("extra-stats", OPT_BOOL, struct xsk_opts, extra_stats, .short_opt = 'x', .help = "Display extra statistics"), DEFINE_OPTION("quiet", OPT_BOOL, struct xsk_opts, quiet, .short_opt = 'Q', .help = "Do not display any statistics"), DEFINE_OPTION("app-stats", OPT_BOOL, struct xsk_opts, app_stats, .short_opt = 'a', .help = "Display application (syscall) statistics"), DEFINE_OPTION("copy_mode", OPT_ENUM, struct xsk_opts, copy_mode, .short_opt = 'C', .typearg = xsk_copy_modes, .metavar = "", .help = "Use for copying data packets to userspace; default auto"), DEFINE_OPTION("clock", OPT_ENUM, struct xsk_opts, clock, .short_opt = 'w', .typearg = xsk_clocks, .metavar = "", .help = "Clock name to use; default MONOTONIC"), DEFINE_OPTION("policy", OPT_ENUM, struct xsk_opts, sched_policy, .short_opt = 'W', .typearg = xsk_sched_policies, .metavar = "", .help = "Scheduler policy; 
default SCHED_OTHER"), DEFINE_OPTION("schpri", OPT_U32, struct xsk_opts, sched_prio, .short_opt = 'U', .metavar = "", .help = "Scheduler priority; default 0"), DEFINE_OPTION("attach-mode", OPT_ENUM, struct xsk_opts, attach_mode, .short_opt = 'A', .typearg = xdp_modes, .metavar = "", .help = "Load XDP program in ; default native"), DEFINE_OPTION("dev", OPT_IFNAME, struct xsk_opts, iface, .positional = true, .metavar = "", .required = true, .help = "Load on device "), END_OPTIONS }; static int do_xsk_udp(const void *cfg, __unused const char *pin_root_path) { const struct xsk_opts *opt = cfg; struct xsk_ctx *ctx; pthread_t pt; int ret; ret = xsk_validate_opts(opt); if (ret) return ret; ctx = xsk_ctx__create(opt, XSK_BENCH_TXONLY); ret = libxdp_get_error(ctx); if (ret) return ret; pr_info("Transmitting on %s (ifindex %d)\n", opt->iface.ifname, opt->iface.ifindex); ret = xsk_start_bench(ctx, &pt); if (ret) goto out; ret = xsk_stats_poller(ctx); pthread_join(pt, NULL); out: xsk_ctx__destroy(ctx); return ret; } struct tcp_packet { struct ethhdr eth; struct ipv6hdr iph; struct tcphdr tcp; __u8 payload[1500 - sizeof(struct tcphdr) - sizeof(struct ethhdr) - sizeof(struct ipv6hdr)]; } __attribute__((__packed__)); static __unused struct tcp_packet pkt_tcp = { .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), .iph.version = 6, .iph.nexthdr = IPPROTO_TCP, .iph.payload_len = bpf_htons(sizeof(struct tcp_packet) - offsetof(struct tcp_packet, tcp)), .iph.hop_limit = 64, .iph.saddr.s6_addr16 = {bpf_htons(0xfe80), 0, 0, 0, 0, 0, 0, bpf_htons(1)}, .iph.daddr.s6_addr16 = {bpf_htons(0xfe80), 0, 0, 0, 0, 0, 0, bpf_htons(2)}, .tcp.source = bpf_htons(1), .tcp.dest = bpf_htons(1), .tcp.window = bpf_htons(0x100), .tcp.doff = 5, .tcp.ack = 1, }; static void hexdump_data(void *data, int size) { unsigned char *ptr = data; int i; for (i = 0; i < size; i++) { if (i % 16 == 0) pr_debug("\n%06X: ", i); else if (i % 2 == 0) pr_debug(" "); pr_debug("%02X", *ptr++); } pr_debug("\n"); } static __be16 
calc_tcp_cksum(const struct tcp_packet *pkt) { __u32 chksum = bpf_htons(pkt->iph.nexthdr) + pkt->iph.payload_len; int payload_len = sizeof(pkt->payload); struct tcphdr tcph_ = pkt->tcp; __u16 *ptr = (void *)&tcph_; int i; tcph_.check = 0; for (i = 0; i < 8; i++) { chksum += pkt->iph.saddr.s6_addr16[i]; chksum += pkt->iph.daddr.s6_addr16[i]; } for (i = 0; i < 10; i++) chksum += *(ptr++); ptr = (void *)&pkt->payload; for (i = 0; i < payload_len / 2; i++) chksum += *(ptr++); if (payload_len % 2) chksum += (*((__u8 *)ptr)) << 8; while (chksum >> 16) chksum = (chksum & 0xFFFF) + (chksum >> 16); return ~chksum; } static void prepare_tcp_pkt(const struct tcp_flowkey *fkey, const struct tcp_flowstate *fstate) { memcpy(pkt_tcp.eth.h_source, fstate->src_mac, ETH_ALEN); memcpy(pkt_tcp.eth.h_dest, fstate->dst_mac, ETH_ALEN); pkt_tcp.iph.saddr = fkey->src_ip; pkt_tcp.iph.daddr = fkey->dst_ip; pkt_tcp.tcp.source = fkey->src_port; pkt_tcp.tcp.dest = fkey->dst_port; pkt_tcp.tcp.seq = bpf_htonl(fstate->seq); pkt_tcp.tcp.ack_seq = bpf_htonl(fstate->rcv_seq); pkt_tcp.tcp.check = calc_tcp_cksum(&pkt_tcp); pr_debug("TCP packet:\n"); hexdump_data(&pkt_tcp, sizeof(pkt_tcp)); } static const struct tcpopt { __u32 num_pkts; struct iface iface; char *dst_addr; __u16 dst_port; __u16 interval; __u16 timeout; enum xdp_attach_mode mode; } defaults_tcp = { .interval = 1, .dst_port = 10000, .timeout = 2, .mode = XDP_MODE_NATIVE, }; static struct prog_option tcp_options[] = { DEFINE_OPTION("dst-port", OPT_U16, struct tcpopt, dst_port, .short_opt = 'p', .metavar = "", .help = "Connect to destination . 
Default 10000"), DEFINE_OPTION("num-packets", OPT_U32, struct tcpopt, num_pkts, .short_opt = 'n', .metavar = "", .help = "Number of packets to send"), DEFINE_OPTION("interval", OPT_U16, struct tcpopt, interval, .short_opt = 'I', .metavar = "", .help = "Output statistics with this interval"), DEFINE_OPTION("timeout", OPT_U16, struct tcpopt, timeout, .short_opt = 't', .metavar = "", .help = "TCP connect timeout (default 2 seconds)."), DEFINE_OPTION("interface", OPT_IFNAME, struct tcpopt, iface, .metavar = "", .required = true, .short_opt = 'i', .help = "Connect through device "), DEFINE_OPTION("mode", OPT_ENUM, struct tcpopt, mode, .short_opt = 'm', .typearg = xdp_modes, .metavar = "", .help = "Load ingress XDP program in ; default native"), DEFINE_OPTION("dst-addr", OPT_STRING, struct tcpopt, dst_addr, .positional = true, .required = true, .metavar = "", .help = "Destination host of generated stream"), END_OPTIONS }; int do_tcp(const void *opt, __unused const char *pin_root_path) { const struct tcpopt *cfg = opt; struct addrinfo *ai = NULL, hints = { .ai_family = AF_INET6, .ai_socktype = SOCK_STREAM, .ai_protocol = IPPROTO_TCP, }; struct ip_addr local_addr = { .af = AF_INET6 }, remote_addr = { .af = AF_INET6 }; struct bpf_map *state_map = NULL, *fstate_map; DECLARE_LIBXDP_OPTS(xdp_program_opts, opts, .prog_name = "xdp_handle_tcp_recv"); struct xdp_program *ifindex_prog = NULL, *test_prog = NULL; struct sockaddr_in6 local_saddr = {}, *addr6; struct thread_config *t = NULL, tcfg = { .pkt = &pkt_tcp, .pkt_size = sizeof(pkt_tcp), .num_pkts = cfg->num_pkts, }; struct trafficgen_state bpf_state = {}; struct xdp_trafficgen *skel = NULL; char buf_local[50], buf_remote[50]; pthread_t *runner_threads = NULL; socklen_t sockaddr_sz, tcpi_sz; __u16 local_port, remote_port; int sock = -1, err = -EINVAL; struct tcp_flowstate fstate; struct timeval timeout = { .tv_sec = cfg->timeout, }; struct tcp_info tcpi = {}; bool attached = false; __u16 num_threads = 1; __u32 key = 0; char 
port[6]; int i, sopt; err = probe_kernel_support(); if (err) return err; skel = xdp_trafficgen__open(); if (!skel) { err = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-err)); goto out; } err = sample_init_pre_load(skel, cfg->iface.ifname); if (err < 0) { pr_warn("Failed to sample_init_pre_load: %s\n", strerror(-err)); goto out; } opts.obj = skel->obj; skel->rodata->config.ifindex_out = cfg->iface.ifindex; snprintf(port, sizeof(port), "%d", cfg->dst_port); err = getaddrinfo(cfg->dst_addr, port, &hints, &ai); if (err) { pr_warn("Couldn't resolve hostname: %s\n", gai_strerror(err)); goto out; } addr6 = (struct sockaddr_in6* )ai->ai_addr; remote_addr.addr.addr6 = addr6->sin6_addr; remote_port = bpf_ntohs(addr6->sin6_port); bpf_state.flow_key.dst_port = addr6->sin6_port; bpf_state.flow_key.dst_ip = addr6->sin6_addr; print_addr(buf_remote, sizeof(buf_remote), &remote_addr); ifindex_prog = xdp_program__create(&opts); if (!ifindex_prog) { err = -errno; pr_warn("Couldn't open XDP program: %s\n", strerror(-err)); goto out; } xdp_program__set_xdp_frags_support(ifindex_prog, true); opts.prog_name = "xdp_redirect_send_tcp"; test_prog = xdp_program__create(&opts); if (!test_prog) { err = -errno; pr_warn("Couldn't find test program: %s\n", strerror(-err)); goto out; } state_map = skel->maps.state_map; fstate_map = skel->maps.flow_state_map; if (!fstate_map) { pr_warn("Couldn't find BPF maps\n"); goto out; } err = xdp_program__attach(ifindex_prog, cfg->iface.ifindex, cfg->mode, 0); if (err) { err = -errno; pr_warn("Couldn't attach XDP program to iface '%s': %s\n", cfg->iface.ifname, strerror(-err)); goto out; } attached = true; err = check_iface_support(&cfg->iface); if (err) goto out; err = bpf_map_update_elem(bpf_map__fd(state_map), &key, &bpf_state, BPF_EXIST); if (err) { err = -errno; pr_warn("Couldn't set initial state map value: %s\n", strerror(-err)); goto out; } err = sample_init(skel, mask, IFINDEX_LO, cfg->iface.ifindex); if (err < 0) { pr_warn("Failed to 
initialize sample: %s\n", strerror(-err)); goto out; } sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { err = -errno; pr_warn("Couldn't open TCP socket: %s\n", strerror(-err)); goto out; } err = setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, &cfg->iface.ifindex, sizeof(cfg->iface.ifindex)); if (err) { err = -errno; pr_warn("Couldn't bind to device '%s': %s\n", cfg->iface.ifname, strerror(-err)); goto out; } sopt = fcntl(sock, F_GETFL, NULL); if (sopt < 0) { err = -errno; pr_warn("Couldn't get socket opts: %s\n", strerror(-err)); goto out; } err = fcntl(sock, F_SETFL, sopt | O_NONBLOCK); if (err) { err = -errno; pr_warn("Couldn't set socket non-blocking: %s\n", strerror(-err)); goto out; } err = connect(sock, ai->ai_addr, ai->ai_addrlen); if (err && errno == EINPROGRESS) { fd_set wait; FD_ZERO(&wait); FD_SET(sock, &wait); err = select(sock + 1, NULL, &wait, NULL, &timeout); if (!err) { err = -1; errno = ETIMEDOUT; } else if (err > 0) { err = 0; } } if (err) { err = -errno; pr_warn("Couldn't connect to destination: %s\n", strerror(-err)); goto out; } err = fcntl(sock, F_SETFL, sopt); if (err) { err = -errno; pr_warn("Couldn't reset socket opts: %s\n", strerror(-err)); goto out; } sockaddr_sz = sizeof(local_saddr); err = getsockname(sock, (struct sockaddr *)&local_saddr, &sockaddr_sz); if (err) { err = -errno; pr_warn("Couldn't get local address: %s\n", strerror(-err)); goto out; } local_addr.addr.addr6 = local_saddr.sin6_addr; local_port = bpf_htons(local_saddr.sin6_port); print_addr(buf_local, sizeof(buf_local), &local_addr); printf("Connected to %s port %d from %s port %d\n", buf_remote, remote_port, buf_local, local_port); bpf_state.flow_key.src_port = local_saddr.sin6_port; bpf_state.flow_key.src_ip = local_saddr.sin6_addr; tcpi_sz = sizeof(tcpi); err = getsockopt(sock, IPPROTO_TCP, TCP_INFO, &tcpi, &tcpi_sz); if (err) { err = -errno; pr_warn("Couldn't get TCP_INFO for socket: %s\n", strerror(-err)); goto out; } err = 
bpf_map_lookup_elem(bpf_map__fd(fstate_map), &bpf_state.flow_key, &fstate); if (err) { err = -errno; pr_warn("Couldn't find flow state in map: %s\n", strerror(-err)); goto out; } if (tcpi.tcpi_snd_wnd != fstate.window) { pr_warn("TCP_INFO and packet data disagree on window (%u != %u)\n", tcpi.tcpi_snd_wnd, fstate.window); } fstate.wscale = tcpi.tcpi_rcv_wscale; fstate.flow_state = FLOW_STATE_RUNNING; err = bpf_map_update_elem(bpf_map__fd(fstate_map), &bpf_state.flow_key, &fstate, BPF_EXIST); if (err) { err = -errno; pr_warn("Couldn't update flow state map: %s\n", strerror(-err)); goto out; } err = bpf_map_update_elem(bpf_map__fd(state_map), &key, &bpf_state, BPF_EXIST); if (err) { err = -errno; pr_warn("Couldn't update program state map: %s\n", strerror(-err)); goto out; } prepare_tcp_pkt(&bpf_state.flow_key, &fstate); err = create_runners(&runner_threads, &t, num_threads, &tcfg, test_prog); if (err) goto out; err = sample_run(cfg->interval, NULL, NULL); status_exited = true; for (i = 0; i < num_threads; i++) { pthread_join(runner_threads[i], NULL); xdp_program__close(t[i].prog); } /* send 3 RSTs with 200ms interval to kill the other side of the connection */ for (i = 0; i < 3; i++) { usleep(200000); pkt_tcp.tcp.rst = 1; pkt_tcp.iph.payload_len = bpf_htons(sizeof(struct tcphdr)); pkt_tcp.tcp.check = calc_tcp_cksum(&pkt_tcp); tcfg.cpu_core_id = 0; tcfg.num_pkts = 1; tcfg.pkt_size = offsetof(struct tcp_packet, payload); tcfg.prog = test_prog; run_traffic(&tcfg); } out: if (ai) freeaddrinfo(ai); if (sock >= 0) close(sock); if (attached) xdp_program__detach(ifindex_prog, cfg->iface.ifindex, cfg->mode, 0); xdp_program__close(ifindex_prog); xdp_program__close(test_prog); xdp_trafficgen__destroy(skel); free(runner_threads); free(t); return err; } static const struct probeopt { struct iface iface; } defaults_probe = {}; static struct prog_option probe_options[] = { DEFINE_OPTION("interface", OPT_IFNAME, struct probeopt, iface, .metavar = "", .short_opt = 'i', .help = 
"Probe features of device "), END_OPTIONS }; int do_probe(const void *opt, __unused const char *pin_root_path) { const struct probeopt *cfg = opt; int err1 = 0, err2; if (cfg->iface.ifindex) { err1 = check_iface_support(&cfg->iface); if (err1) { const char *name = get_driver_name(cfg->iface.ifindex); if (driver_needs_xdp_pass(&cfg->iface)) { pr_info(" Note that this driver (%s) needs an XDP program " "loaded to use XDP_REDIRECT.\n" " Loading a dummy XDP program on the interface " "may enable support.\n", name); } else { if (!strcmp(name, "veth")) pr_info(" Note that enabling GRO on both ends of a " "veth pair may enable XDP support\n"); } } } err2 = probe_kernel_support(); if (!err2) pr_info("Kernel supports live packet mode for XDP BPF_PROG_RUN.\n"); return !(!err1 && !err2); } int do_help(__unused const void *cfg, __unused const char *pin_root_path) { fprintf(stderr, "Usage: xdp-trafficgen COMMAND [options]\n" "\n" "COMMAND can be one of:\n" " udp - run in UDP mode\n" " xsk-udp - run in UDP mode (using AF_XDP sockets)\n" " tcp - run in TCP mode\n" " probe - probe kernel support\n" " help - show this help message\n" "\n" "Use 'xdp-trafficgen COMMAND --help' to see options for each command\n"); return -1; } static const struct prog_command cmds[] = { DEFINE_COMMAND(udp, "Run in UDP mode"), DEFINE_COMMAND_NAME("xsk-udp", xsk_udp, "Run in UDP mode (using AF_XDP sockets)"), DEFINE_COMMAND(tcp, "Run in TCP mode"), DEFINE_COMMAND(probe, "Probe kernel support"), { .name = "help", .func = do_help, .no_cfg = true }, END_COMMANDS }; union all_opts { struct udpopt udp; struct tcpopt tcp; struct xsk_opts xsk; }; int main(int argc, char **argv) { if (argc > 1) return dispatch_commands(argv[1], argc - 1, argv + 1, cmds, sizeof(union all_opts), PROG_NAME, false); return do_help(NULL, NULL); } xdp-tools-1.6.1/xdp-trafficgen/xdp-trafficgen.h000066400000000000000000000016621514310632100214240ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #ifndef 
XDP_TRAFFICGEN_H #define XDP_TRAFFICGEN_H #include #include #include struct tcp_flowkey { struct in6_addr src_ip; struct in6_addr dst_ip; __u16 dst_port; __u16 src_port; }; #define FLOW_STATE_NEW 1 #define FLOW_STATE_RUNNING 2 #define FLOW_STATE_DONE 3 struct tcp_flowstate { struct bpf_spin_lock lock; __u8 dst_mac[ETH_ALEN]; __u8 src_mac[ETH_ALEN]; __u64 last_progress; __u64 retransmits; __u32 flow_state; __u32 seq; /* our last sent seqno */ __u32 ack_seq; /* last seqno that got acked */ __u32 rcv_seq; /* receiver's seqno (our ACK seq) */ __u32 dupack; __u32 last_print; __u32 highest_seq; __u16 window; __u8 wscale; }; struct trafficgen_config { int ifindex_out; __u16 port_start; __u16 port_range; }; struct trafficgen_state { struct tcp_flowkey flow_key; __u16 next_port; }; #endif xdp-tools-1.6.1/xdp-trafficgen/xdp_trafficgen.bpf.c000066400000000000000000000210551514310632100222450ustar00rootroot00000000000000/* SPDX-License-Identifier: GPL-2.0 */ #define XDP_STATS_MAP_PINNING LIBBPF_PIN_NONE #include "xdp-trafficgen.h" #include #include #include #include #include #include #include #include #include #include #include #include #if defined(HAVE_LIBBPF_BPF_PROGRAM__FLAGS) && defined(DEBUG) /* We use the many-argument version of bpf_printk() for debugging, so only * enable it if we have the libbpf helper that selects the vprintf version. This * was introduced in libbpf 0.6.0, which is the same versionn as the * bpf_program__flags() method, so use that as an indicator since we don't * feature detect on the BPF helpers themselves. 
*/ #define TCP_DEBUG #endif char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); __type(key, __u32); __type(value, struct trafficgen_state); } state_map SEC(".maps"); const volatile struct trafficgen_config config; static void update_checksum(__u16 *sum, __u32 diff) { /* We use the RFC 1071 method for incremental checksum updates * because that can be used directly with the 32-bit sequence * number difference (relying on folding for large differences) */ __u32 cksum = diff + (__u16)~bpf_ntohs(*sum); while (cksum > 0xffff) cksum = (cksum & 0xffff) + (cksum >> 16); *sum = bpf_htons(~cksum); } static __u16 csum_fold_helper(__u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return ~((csum & 0xffff) + (csum >> 16)); } SEC("xdp") int xdp_redirect_notouch(struct xdp_md *ctx) { __u32 key = bpf_get_smp_processor_id();; struct datarec *rec; rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) return XDP_ABORTED; NO_TEAR_INC(rec->xdp_redirect); return bpf_redirect(config.ifindex_out, 0); } SEC("xdp") int xdp_redirect_update_port(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct trafficgen_state *state; __u16 cur_port, port_diff; int action = XDP_ABORTED; struct datarec *rec; struct udphdr *hdr; __u32 key = 0; hdr = data + (sizeof(struct ethhdr) + sizeof(struct ipv6hdr)); if (hdr + 1 > data_end) goto out; state = bpf_map_lookup_elem(&state_map, &key); if (!state) goto out; key = bpf_get_smp_processor_id(); rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) goto out; cur_port = bpf_ntohs(hdr->dest); port_diff = state->next_port - cur_port; if (port_diff) { update_checksum(&hdr->check, port_diff); hdr->dest = bpf_htons(state->next_port); } if (state->next_port++ >= config.port_start + config.port_range - 1) state->next_port = config.port_start; action = bpf_redirect(config.ifindex_out, 0); NO_TEAR_INC(rec->processed); out: return action; } SEC("xdp") int 
xdp_drop(struct xdp_md *ctx) { return XDP_DROP; } struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1); __type(key, struct tcp_flowkey); __type(value, struct tcp_flowstate); } flow_state_map SEC(".maps"); static int cmp_ipaddr(struct in6_addr *a_, struct in6_addr *b_) { __u8 *a = (void *)a_, *b = (void *)b_; int i; for (i = 0; i < sizeof(struct in6_addr); i++) { if (*a > *b) return -1; if (*a < *b) return 1; a++; b++; } return 0; } static inline __u8 before(__u32 seq1, __u32 seq2) { return (__s32)(seq1 - seq2) < 0; } /* Fixed 2 second timeout */ #define TCP_RTO 2000000000UL SEC("xdp") int xdp_handle_tcp_recv(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; struct tcp_flowstate *fstate, new_fstate = {}; void *data = (void *)(long)ctx->data; struct hdr_cursor nh = { .pos = data }; struct trafficgen_state *state; struct tcp_flowkey key = {}; int eth_type, ip_type, err; struct ipv6hdr *ipv6hdr; struct tcphdr *tcphdr; int action = XDP_PASS; struct ethhdr *eth; __u8 new_match; __u32 ack_seq; int i; eth_type = parse_ethhdr(&nh, data_end, ð); if (eth_type != bpf_htons(ETH_P_IPV6)) goto out; ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); if (ip_type != IPPROTO_TCP) goto out; if (parse_tcphdr(&nh, data_end, &tcphdr) < 0) goto out; state = bpf_map_lookup_elem(&state_map, &key); if (!state) goto out; /* swap dst and src for received packet */ key.dst_ip = ipv6hdr->saddr; key.dst_port = tcphdr->source; new_match = !cmp_ipaddr(&key.dst_ip, &state->flow_key.dst_ip) && key.dst_port == state->flow_key.dst_port; key.src_ip = ipv6hdr->daddr; key.src_port = tcphdr->dest; fstate = bpf_map_lookup_elem(&flow_state_map, &key); if (!fstate) { if (!new_match) goto out; new_fstate.flow_state = FLOW_STATE_NEW; new_fstate.seq = bpf_ntohl(tcphdr->ack_seq); for (i = 0; i < ETH_ALEN; i++) { new_fstate.dst_mac[i] = eth->h_source[i]; new_fstate.src_mac[i] = eth->h_dest[i]; } err = bpf_map_update_elem(&flow_state_map, &key, &new_fstate, BPF_NOEXIST); if (err) goto 
out; fstate = bpf_map_lookup_elem(&flow_state_map, &key); if (!fstate) goto out; } ack_seq = bpf_ntohl(tcphdr->ack_seq); #ifdef TCP_DEBUG bpf_printk("Got state seq %u ack_seq %u new %u seq %u new %u window %u\n", fstate->seq, fstate->ack_seq, ack_seq, fstate->rcv_seq, bpf_ntohl(tcphdr->seq), bpf_htons(tcphdr->window)); #endif bpf_spin_lock(&fstate->lock); if (fstate->ack_seq == ack_seq) fstate->dupack++; fstate->window = bpf_ntohs(tcphdr->window); fstate->ack_seq = ack_seq; fstate->rcv_seq = bpf_ntohl(tcphdr->seq); if (tcphdr->syn) fstate->rcv_seq++; if (tcphdr->fin || tcphdr->rst) fstate->flow_state = FLOW_STATE_DONE; /* If we've taken over the flow management, (after the handshake), drop * the packet */ if (fstate->flow_state >= FLOW_STATE_RUNNING) action = XDP_DROP; bpf_spin_unlock(&fstate->lock); out: return action; } SEC("xdp") int xdp_redirect_send_tcp(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; __u32 new_seq, ack_seq, window; struct trafficgen_state *state; struct tcp_flowstate *fstate; int action = XDP_ABORTED; struct ipv6hdr *ipv6hdr; struct tcphdr *tcphdr; struct datarec *rec; __u8 resend = 0; #ifdef TCP_DEBUG __u8 print = 0; #endif __u16 pkt_len; __u32 key = 0; __u64 now; ipv6hdr = data + sizeof(struct ethhdr); tcphdr = data + (sizeof(struct ethhdr) + sizeof(struct ipv6hdr)); if (tcphdr + 1 > data_end || ipv6hdr + 1 > data_end) goto ret; pkt_len = bpf_ntohs(ipv6hdr->payload_len) - sizeof(*tcphdr); state = bpf_map_lookup_elem(&state_map, &key); if (!state) goto ret; key = bpf_get_smp_processor_id(); rec = bpf_map_lookup_elem(&rx_cnt, &key); if (!rec) goto ret; fstate = bpf_map_lookup_elem(&flow_state_map, (const void *)&state->flow_key); if (!fstate) goto out; now = bpf_ktime_get_coarse_ns(); bpf_spin_lock(&fstate->lock); if (fstate->flow_state != FLOW_STATE_RUNNING) { action = XDP_DROP; bpf_spin_unlock(&fstate->lock); goto out; } /* reset sequence on packet loss */ if (fstate->dupack || 
(fstate->last_progress && now - fstate->last_progress > TCP_RTO)) { fstate->seq = fstate->ack_seq; fstate->dupack = 0; } new_seq = fstate->seq; ack_seq = fstate->ack_seq; window = fstate->window << fstate->wscale; #ifdef TCP_DEBUG if (fstate->last_print != fstate->seq) { fstate->last_print = fstate->seq; print = 1; } #endif if (!before(new_seq + pkt_len, ack_seq + window)) { /* We caught up to the end up the RWIN, spin until ACKs come * back opening up the window */ action = XDP_DROP; bpf_spin_unlock(&fstate->lock); #ifdef TCP_DEBUG if (print) bpf_printk("Dropping because %u isn't before %u (ack_seq %u wnd %u)", new_seq + pkt_len, ack_seq + window, ack_seq, window); #endif goto out; } if (!before(new_seq, fstate->highest_seq)) { fstate->highest_seq = new_seq; } else { resend = 1; fstate->retransmits++; } fstate->seq = new_seq + pkt_len; fstate->last_progress = now; bpf_spin_unlock(&fstate->lock); new_seq = bpf_htonl(new_seq); if (new_seq != tcphdr->seq) { __u32 csum; csum = bpf_csum_diff(&tcphdr->seq, sizeof(__u32), &new_seq, sizeof(new_seq), ~tcphdr->check); tcphdr->seq = new_seq; tcphdr->check = csum_fold_helper(csum); } action = bpf_redirect(config.ifindex_out, 0); out: /* record retransmissions as XDP_TX return codes until we get better stats */ if (resend) NO_TEAR_INC(rec->issue); if (action == XDP_REDIRECT) NO_TEAR_INC(rec->xdp_redirect); else NO_TEAR_INC(rec->dropped); ret: return action; } SEC("xdp") int xdp_pass(struct xdp_md *ctx) { return XDP_PASS; }