trunk/0000755000175000017500000000000011313644724011550 5ustar benoitbenoittrunk/net/0000755000175000017500000000000011313644724012336 5ustar benoitbenoittrunk/net/rds.h0000644000175000017500000000336011313644724013301 0ustar benoitbenoit/* * net/rds.h - user space interface for RDS * * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef __NET_RDS_H #define __NET_RDS_H #include "ib_rds.h" /* * Return -1, 0 or 1 according to the sign of (id1 - id2). The subtraction * is done on the unsigned 64-bit values and the result is then stored in a * signed 64-bit variable, so the sign reflects the wrapped difference * rather than a plain magnitude comparison. */ static inline int rds_rdma_id_sign(uint64_t id1, uint64_t id2) { int64_t diff = id1 - id2; return (diff < 0)? -1 : ((diff == 0)? 
0 : 1); } /* * Compare two ids: cmp is a relational operator token (such as <, == or >=) * that is applied to rds_rdma_id_sign(id1, id2) and 0. */ #define rds_rdma_id_cmp(id1, cmp, id2) (rds_rdma_id_sign((id1), (id2)) cmp 0) #endif /* __NET_RDS_H */ trunk/net/ib_rds.h0000644000175000017500000001636511313644724013764 0ustar benoitbenoit/* * Copyright (c) 2008 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #ifndef IB_RDS_H #define IB_RDS_H #include /* These sparse annotated types shouldn't be in any user * visible header file. We should clean this up rather * than kludging around them. 
*/ #ifndef __KERNEL__ #define __be16 u_int16_t #define __be32 u_int32_t #define __be64 u_int64_t #endif #define RDS_IB_ABI_VERSION 0x301 /* * setsockopt/getsockopt for SOL_RDS */ #define RDS_CANCEL_SENT_TO 1 #define RDS_GET_MR 2 #define RDS_FREE_MR 3 /* deprecated: RDS_BARRIER 4 */ #define RDS_RECVERR 5 #define RDS_CONG_MONITOR 6 /* * Control message types for SOL_RDS. * * RDS_CMSG_RDMA_ARGS (sendmsg) * Request a RDMA transfer to/from the specified * memory ranges. * The cmsg_data is a struct rds_rdma_args. * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg) * Kernel informs application about intended * source/destination of a RDMA transfer * RDS_CMSG_RDMA_MAP (sendmsg) * Application asks kernel to map the given * memory range into a IB MR, and send the * R_Key along in an RDS extension header. * The cmsg_data is a struct rds_get_mr_args, * the same as for the GET_MR setsockopt. * RDS_CMSG_RDMA_STATUS (recvmsg) * Returns the status of a completed RDMA operation. 
*/ #define RDS_CMSG_RDMA_ARGS 1 #define RDS_CMSG_RDMA_DEST 2 #define RDS_CMSG_RDMA_MAP 3 #define RDS_CMSG_RDMA_STATUS 4 #define RDS_CMSG_CONG_UPDATE 5 #define RDS_INFO_COUNTERS 10000 #define RDS_INFO_CONNECTIONS 10001 /* 10002 aka RDS_INFO_FLOWS is deprecated */ #define RDS_INFO_SEND_MESSAGES 10003 #define RDS_INFO_RETRANS_MESSAGES 10004 #define RDS_INFO_RECV_MESSAGES 10005 #define RDS_INFO_SOCKETS 10006 #define RDS_INFO_TCP_SOCKETS 10007 #define RDS_INFO_IB_CONNECTIONS 10008 /* A named 64-bit counter: 32 bytes of name followed by the value, packed. */ struct rds_info_counter { u_int8_t name[32]; u_int64_t value; } __attribute__((packed)); #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 #define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 struct rds_info_connection { u_int64_t next_tx_seq; u_int64_t next_rx_seq; __be32 laddr; __be32 faddr; u_int8_t transport[15]; /* null term ascii */ u_int8_t flags; } __attribute__((packed)); struct rds_info_flow { __be32 laddr; __be32 faddr; u_int32_t bytes; __be16 lport; __be16 fport; } __attribute__((packed)); #define 
RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 struct rds_info_message { u_int64_t seq; u_int32_t len; __be32 laddr; __be32 faddr; __be16 lport; __be16 fport; u_int8_t flags; } __attribute__((packed)); struct rds_info_socket { u_int32_t sndbuf; __be32 bound_addr; __be32 connected_addr; __be16 bound_port; __be16 connected_port; u_int32_t rcvbuf; uint64_t inum; } __attribute__((packed)); /* Same members as struct rds_info_socket, without the trailing inum field. */ struct rds_info_socket_v1 { u_int32_t sndbuf; __be32 bound_addr; __be32 connected_addr; __be16 bound_port; __be16 connected_port; u_int32_t rcvbuf; } __attribute__((packed)); struct rds_info_tcp_socket { __be32 local_addr; __be16 local_port; __be32 peer_addr; __be16 peer_port; u_int64_t hdr_rem; u_int64_t data_rem; u_int32_t last_sent_nxt; u_int32_t last_expected_una; u_int32_t last_seen_una; } __attribute__((packed)); #define RDS_IB_GID_LEN 16 /* NOTE(review): unlike the other rds_info_* structs in this header, this one is not declared packed - confirm that is intentional. */ struct rds_info_ib_connection { __be32 src_addr; __be32 dst_addr; uint8_t src_gid[RDS_IB_GID_LEN]; uint8_t dst_gid[RDS_IB_GID_LEN]; uint32_t max_send_wr; uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t rdma_fmr_max; uint32_t rdma_fmr_size; }; /* * Congestion monitoring. * Congestion control in RDS happens at the host connection * level by exchanging a bitmap marking congested ports. * By default, a process sleeping in poll() is always woken * up when the congestion map is updated. * With explicit monitoring, an application can have more * fine-grained control. * The application installs a 64bit mask value in the socket, * where each bit corresponds to a group of ports. * When a congestion update arrives, RDS checks the set of * ports that are now uncongested against the list bit mask * installed in the socket, and if they overlap, we queue a * cong_notification on the socket. 
* * To install the congestion monitor bitmask, use RDS_CONG_MONITOR * with the 64bit mask. * Congestion updates are received via RDS_CMSG_CONG_UPDATE * control messages. 
* * The correspondence between bits and ports is * 1 << (portnum % 64) */ #define RDS_CONG_MONITOR_SIZE 64 /* Bit index (0 .. RDS_CONG_MONITOR_SIZE - 1) for a port. The argument is parenthesized before the cast so that an expression argument is converted as a whole rather than only its first operand. */ #define RDS_CONG_MONITOR_BIT(port) (((unsigned int)(port)) % RDS_CONG_MONITOR_SIZE) /* 64-bit mask with only the bit for the given port set. */ #define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port)) /* * RDMA related types */ /* * This encapsulates a remote memory location. * In the current implementation, it contains the R_Key * of the remote memory region, and the offset into it * (so that the application does not have to worry about * alignment). */ typedef u_int64_t rds_rdma_cookie_t; struct rds_iovec { u_int64_t addr; u_int64_t bytes; }; struct rds_get_mr_args { struct rds_iovec vec; u_int64_t cookie_addr; uint64_t flags; }; struct rds_free_mr_args { rds_rdma_cookie_t cookie; u_int64_t flags; }; struct rds_rdma_args { rds_rdma_cookie_t cookie; struct rds_iovec remote_vec; u_int64_t local_vec_addr; u_int64_t nr_local; u_int64_t flags; u_int64_t user_token; }; struct rds_rdma_notify { u_int64_t user_token; int32_t status; }; #define RDS_RDMA_SUCCESS 0 #define RDS_RDMA_REMOTE_ERROR 1 #define RDS_RDMA_CANCELED 2 #define RDS_RDMA_DROPPED 3 #define RDS_RDMA_OTHER_ERROR 4 /* * Common set of flags for all RDMA related structs */ #define RDS_RDMA_READWRITE 0x0001 #define RDS_RDMA_FENCE 0x0002 /* use FENCE for immediate send */ #define RDS_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */ #define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ #define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ #define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ #endif /* IB_RDS_H */ trunk/docs/0000755000175000017500000000000011313644724012500 5ustar benoitbenoittrunk/docs/rds-architecture.txt0000644000175000017500000003253711313644724016523 0ustar benoitbenoit Overview ======== This readme tries to provide some background on the hows and whys of RDS, and will hopefully help you find your way around the code. 
In addition, please see this email about RDS origins: http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html RDS Architecture ================ RDS provides reliable, ordered datagram delivery by using a single reliable connection between any two nodes in the cluster. This allows applications to use a single socket to talk to any other process in the cluster - so in a cluster with N processes you need N sockets, in contrast to N*N if you use a connection-oriented socket transport like TCP. RDS is not Infiniband-specific; it was designed to support different transports. The current implementation used to support RDS over TCP as well as IB. Work is in progress to support RDS over iWARP, and using DCE to guarantee no dropped packets on Ethernet, it may be possible to use RDS over UDP in the future. The high-level semantics of RDS from the application's point of view are * Addressing RDS uses IPv4 addresses and 16bit port numbers to identify the end point of a connection. All socket operations that involve passing addresses between kernel and user space generally use a struct sockaddr_in. The fact that IPv4 addresses are used does not mean the underlying transport has to be IP-based. In fact, RDS over IB uses a reliable IB connection; the IP address is used exclusively to locate the remote node's GID (by ARPing for the given IP). The port space is entirely independent of UDP, TCP or any other protocol. * Socket interface RDS sockets work *mostly* as you would expect from a BSD socket. The next section will cover the details. At any rate, all I/O is performed through the standard BSD socket API. Some additions like zerocopy support are implemented through control messages, while other extensions use the getsockopt/ setsockopt calls. Sockets must be bound before you can send or receive data. This is needed because binding also selects a transport and attaches it to the socket. Once bound, the transport assignment does not change. 
RDS will tolerate IPs moving around (e.g. in an active-active HA scenario), but only as long as the address doesn't move to a different transport. * sysctls RDS supports a number of sysctls in /proc/sys/net/rds Socket Interface ================ AF_RDS, PF_RDS, SOL_RDS These constants haven't been assigned yet, because RDS isn't in mainline yet. Currently, the kernel module assigns some constant and publishes it to user space through two sysctl files /proc/sys/net/rds/pf_rds /proc/sys/net/rds/sol_rds fd = socket(PF_RDS, SOCK_SEQPACKET, 0); This creates a new, unbound RDS socket. setsockopt(SOL_SOCKET): send and receive buffer size RDS honors the send and receive buffer size socket options. You are not allowed to queue more than SO_SNDSIZE bytes to a socket. A message is queued when sendmsg is called, and it leaves the queue when the remote system acknowledges its arrival. The SO_RCVSIZE option controls the maximum receive queue length. This is a soft limit rather than a hard limit - RDS will continue to accept and queue incoming messages, even if that takes the queue length over the limit. However, it will also mark the port as "congested" and send a congestion update to the source node. The source node is supposed to throttle any processes sending to this congested port. bind(fd, &sockaddr_in, ...) This binds the socket to a local IP address and port, and a transport. sendmsg(fd, ...) Sends a message to the indicated recipient. The kernel will transparently establish the underlying reliable connection if it isn't up yet. An attempt to send a message that exceeds SO_SNDSIZE will return with -EMSGSIZE. An attempt to send a message that would take the total number of queued bytes over the SO_SNDSIZE threshold will return EAGAIN. An attempt to send a message to a destination that is marked as "congested" will return ENOBUFS. recvmsg(fd, ...) Receives a message that was queued to this socket. 
The socket's recv queue accounting is adjusted, and if the queue length drops below SO_RCVSIZE, the port is marked uncongested, and a congestion update is sent to all peers. Applications can ask the RDS kernel module to receive notifications via control messages (for instance, there is a notification when a congestion update arrives, or when an RDMA operation completes). These notifications are received through the msg.msg_control buffer of struct msghdr. The format of the messages is described in manpages. poll(fd) RDS supports the poll interface to allow the application to implement async I/O. POLLIN handling is pretty straightforward. When there's an incoming message queued to the socket, or a pending notification, we signal POLLIN. POLLOUT is a little harder. Since you can essentially send to any destination, RDS will always signal POLLOUT as long as there's room on the send queue (ie the number of bytes queued is less than the sendbuf size). However, the kernel will refuse to accept messages to a destination marked congested - in this case you will loop forever if you rely on poll to tell you what to do. This isn't a trivial problem, but applications can deal with this - by using congestion notifications, and by checking for ENOBUFS errors returned by sendmsg. setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in) This allows the application to discard all messages queued to a specific destination on this particular socket. This allows the application to cancel outstanding messages if it detects a timeout. For instance, if it tried to send a message, and the remote host is unreachable, RDS will keep trying forever. The application may decide it's not worth it, and cancel the operation. In this case, it would use RDS_CANCEL_SENT_TO to nuke any pending messages. 
RDMA for RDS ============ see rds-rdma(7) manpage (available in rds-tools) Congestion Notifications ======================== see rds(7) manpage RDS Protocol ============ Message header The message header is a 'struct rds_header' (see rds.h): Fields: h_sequence: per-packet sequence number h_ack: piggybacked acknowledgment of last packet received h_len: length of data, not including header h_sport: source port h_dport: destination port h_flags: CONG_BITMAP - this is a congestion update bitmap ACK_REQUIRED - receiver must ack this packet RETRANSMITTED - packet has previously been sent h_credit: indicate to other end of connection that it has more credits available (i.e. there is more send room) h_padding[4]: unused, for future use h_csum: header checksum h_exthdr: optional data can be passed here. This is currently used for passing RDMA-related information. ACK and retransmit handling One might think that with reliable IB connections you wouldn't need to ack messages that have been received. The problem is that IB hardware generates an ack message before it has DMAed the message into memory. This creates a potential message loss if the HCA is disabled for any reason between when it sends the ack and before the message is DMAed and processed. This is only a potential issue if another HCA is available for fail-over. Sending an ack immediately would allow the sender to free the sent message from their send queue quickly, but could cause excessive traffic to be used for acks. RDS piggybacks acks on sent data packets. Ack-only packets are reduced by only allowing one to be in flight at a time, and by the sender only asking for acks when its send buffers start to fill up. All retransmissions are also acked. Flow Control RDS's IB transport uses a credit-based mechanism to verify that there is space in the peer's receive buffers for more data. This eliminates the need for hardware retries on the connection. 
Congestion Messages waiting in the receive queue on the receiving socket are accounted against the socket's SO_RCVBUF option value. Only the payload bytes in the message are accounted for. If the number of bytes queued equals or exceeds rcvbuf then the socket is congested. All sends attempted to this socket's address should block or return -EWOULDBLOCK. Applications are expected to be reasonably tuned such that this situation very rarely occurs. An application encountering this "back-pressure" is considered a bug. This is implemented by having each node maintain bitmaps which indicate which ports on bound addresses are congested. As the bitmap changes it is sent through all the connections which terminate in the local address of the bitmap which changed. The bitmaps are allocated as connections are brought up. This avoids allocation in the interrupt handling path which queues messages on sockets. The dense bitmaps let transports send the entire bitmap on any bitmap change reasonably efficiently. This is much easier to implement than some finer-grained communication of per-port congestion. The sender does a very inexpensive bit test to test if the port it's about to send to is congested or not. RDS Transport Layer ================== As mentioned above, RDS is not IB-specific. Its code is divided into a general RDS layer and a transport layer. The general layer handles the socket API, congestion handling, loopback, stats, usermem pinning, and the connection state machine. The transport layer handles the details of the transport. The IB transport, for example, handles all the queue pairs, work requests, CM event handlers, and other Infiniband details. RDS Kernel Structures ===================== struct rds_message aka possibly "rds_outgoing", the generic RDS layer copies data to be sent and sets header fields as needed, based on the socket API. This is then queued for the individual connection and sent by the connection's transport. 
struct rds_incoming a generic struct referring to incoming data that can be handed from the transport to the general code and queued by the general code while the socket is awoken. It is then passed back to the transport code to handle the actual copy-to-user. struct rds_socket per-socket information struct rds_connection per-connection information struct rds_transport pointers to transport-specific functions struct rds_statistics non-transport-specific statistics struct rds_cong_map wraps the raw congestion bitmap, contains rbnode, waitq, etc. Connection management ===================== Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and ERROR states. The first time an attempt is made by an RDS socket to send data to a node, a connection is allocated and connected. That connection is then maintained forever -- if there are transport errors, the connection will be dropped and re-established. Dropping a connection while packets are queued will cause queued or partially-sent datagrams to be retransmitted when the connection is re-established. The send path ============= rds_sendmsg() struct rds_message built from incoming data CMSGs parsed (e.g. 
RDMA ops) transport connection alloced and connected if not already rds_message placed on send queue send worker awoken rds_send_worker() calls rds_send_xmit() until queue is empty rds_send_xmit() transmits congestion map if one is pending may set ACK_REQUIRED calls transport to send either non-RDMA or RDMA message (RDMA ops never retransmitted) rds_ib_xmit() allocs work requests from send ring adds any new send credits available to peer (h_credits) maps the rds_message's sg list piggybacks ack populates work requests post send to connection's queue pair The recv path ============= rds_ib_recv_cq_comp_handler() looks at write completions unmaps recv buffer from device no errors, call rds_ib_process_recv() refill recv ring rds_ib_process_recv() validate header checksum copy header to rds_ib_incoming struct if start of a new datagram add to ibinc's fraglist if completed datagram: update cong map if datagram was cong update call rds_recv_incoming() otherwise note if ack is required rds_recv_incoming() drop duplicate packets respond to pings find the sock associated with this datagram add to sock queue wake up sock do some congestion calculations rds_recvmsg copy data into user iovec handle CMSGs return to application trunk/stap/0000755000175000017500000000000011313644724012517 5ustar benoitbenoittrunk/stap/rds.stp0000644000175000017500000000061411313644724014040 0ustar benoitbenoit/* probe module("rds").function("rds_*") { printf("RDS %s\n", pp()) } */ /* Statistics aggregate fed by the return probes below and printed as a log2 histogram every 5 seconds. */ global reads probe begin { reads <<< 0 } /* Return probes record the probed function's return value, which SystemTap exposes as $return. */ probe module("rds").function("rds_recvmsg").return { reads <<< $return } probe module("rds").function("rds_send_pong").return { reads <<< $return //println(caller()) } probe timer.sec(5) { println("RDS bytes received") print(@hist_log(reads)) } probe end { printf("end!\n") } trunk/stap/README0000644000175000017500000000101111313644724013370 0ustar benoitbenoitSystemTap script for RDS SystemTap: http://sourceware.org/systemtap/ SystemTap wiki: http://sourceware.org/systemtap/wiki To use 
SystemTap for tracing RDS, please ensure you have debugging symbols available for both your installed kernel as well as RDS module. These usually take the form of *-debuginfo RPMs, and may be available via your distro's update repository, a distro repository disabled by default, or via your distro's website. Please send any comments or improvement patches to rds-devel@oss.oracle.com. trunk/Makefile0000644000175000017500000000501711313644724013213 0ustar benoitbenoitprefix = /usr exec_prefix = ${prefix} bindir = $(DESTDIR)${exec_prefix}/bin mandir = $(DESTDIR)${prefix}/share/man incdir = $(DESTDIR)${prefix}/include all: all-programs CFLAGS = -O2 -Wall CPPFLAGS = -DDEBUG_EXE -MD -MP -MF $(@D)/.$(basename $(@F)).d HEADERS = kernel-list.h rdstool.h pfhack.h net/rds.h net/ib_rds.h COMMON_SOURCES = options.c stats.c pfhack.c SOURCES = $(addsuffix .c,$(PROGRAMS)) $(COMMON_SOURCES) CLEAN_OBJECTS = $(addsuffix .o,$(PROGRAMS)) $(subst .c,.o,$(COMMON_SOURCES)) # This is the default DYNAMIC_PF_RDS = true ifneq ($(DYNAMIC_PF_RDS),) CPPFLAGS += -DDYNAMIC_PF_RDS COMMON_OBJECTS = $(subst .c,.o,$(COMMON_SOURCES)) else COMMON_OBJECTS = $(subst .c,.o,$(filter-out pfhack.c,$(COMMON_SOURCES))) endif PROGRAMS = rds-gen rds-sink rds-info rds-stress rds-ping all-programs: $(PROGRAMS) install: $(PROGRAMS) install -d $(bindir) install -m 555 -s $(PROGRAMS) $(bindir) install -d $(mandir)/man1 install -d $(mandir)/man7 install -m 644 *.1 $(mandir)/man1 install -m 644 *.7 $(mandir)/man7 install -d $(incdir)/net install -m 444 net/rds.h net/ib_rds.h $(incdir)/net clean: rm -f $(PROGRAMS) $(CLEAN_OBJECTS) distclean: clean rm -f .*.d $(PROGRAMS) : % : %.o $(COMMON_OBJECTS) gcc $(CFLAGS) $(LDFLAGS) -o $@ $^ LOCAL_DFILES := $(wildcard .*.d) ifneq ($(LOCAL_DFILES),) .PHONY: $(LOCAL_DFILES) -include $(LOCAL_DFILES) endif VERSION := 1.4 RELEASE := 1 TAR_PREFIX := rds-tools-$(VERSION)-$(RELEASE) TAR_FILE := $(TAR_PREFIX).tar.gz EXTRA_DIST := rds-info.1 \ rds-gen.1 \ rds-sink.1 \ rds-stress.1 \ 
rds-ping.1 \ rds.7 \ rds-rdma.7 \ Makefile.in \ rds-tools.spec.in \ configure.in \ configure \ README \ rds-tools.txt \ stap/rds.stp \ stap/README \ docs/rds-architecture.txt \ examples/Makefile \ examples/rds-sample.c \ examples/README DISTFILES := $(SOURCES) $(HEADERS) $(EXTRA_DIST) $(TAR_FILE): Makefile rds-tools.spec @rm -rf $@ $(TAR_PREFIX) || : @mkdir $(TAR_PREFIX) for a in $^ $(DISTFILES); do \ if [ ! -f $$a ]; then \ continue; \ fi; \ targ=$(TAR_PREFIX)/$$(dirname $$a); \ mkdir -p $$targ; \ cp $$a $$targ; \ done tar -zcf $@ $(TAR_PREFIX) .PHONY: rpm rpm: $(TAR_FILE) rpmbuild -ta $^ .PHONY: dist dist: $(TAR_FILE) trunk/rds-tools.txt0000644000175000017500000000201511313644724014235 0ustar benoitbenoit So, rds-get-stats is easy and I already have it done. we'd just import that. rds-gen would just send down a socket. I'm hoping for options like: -s addr:port to bind the source address -d addr:port dest to send to, maybe just round-robin between multiple to start? -m units the size of each sent message -b units the size of the socket buffer -5 include an md5sum at the tail of each message -f file read from a file until eof -p units send from a memory pool of the given length -S file put the -p pool in this mmaped/mlocked file, use sendfile -l units only send this many bytes total -i timespec output vmstat-like line at this interval I guess that gives us enough to chew on for now :) I want this stuff to be dirt simple. trivial arg parser helpers, maybe some list.h from the kernel, no glib complexity explosion. I guess I could send you some snippets of code along those lines. Oh, and I guess we'll need a little helper amongst the tools to get pf_rds and sol_rds from /proc/sys/net/rds/. - z trunk/rds.70000644000175000017500000003726011313644724012440 0ustar benoitbenoit.TH RDS 7 .SH NAME RDS \- Reliable Datagram Sockets .SH SYNOPSIS .nf .B #include .B #include .fi .SH DESCRIPTION This is an implementation of the RDS socket API. 
It provides reliable, in-order datagram delivery between sockets over a variety of transports. .PP Currently, RDS can be transported over Infiniband, and loopback. RDS over TCP is disabled, but will be re-enabled in the near future. .PP RDS uses standard .B AF_INET addresses as described in .BR ip (7) to identify end points. .\"------------------------------------------------------------------ .SS Socket Creation RDS is still in development and as such does not have a reserved protocol family constant. Applications must read the string representation of the protocol family value from the .B pf_rds sysctl parameter file described below. .PP .nf .B rds_socket = socket(pf_rds, SOCK_SEQPACKET, 0); .fi .PP .\"------------------------------------------------------------------ .SS Socket Options RDS sockets support a number of socket options through the .BR setsockopt (2) and .BR getsockopt (2) calls. The following generic options (with socket level .BR SOL_SOCKET ) are of specific importance: .TP .B SO_RCVBUF Specifies the size of the receive buffer. See section on "Congestion Control" below. .TP .B SO_SNDBUF Specifies the size of the send buffer. See "Message Transmission" below. .TP .B SO_SNDTIMEO Specifies the send timeout when trying to enqueue a message on a socket with a full queue in blocking mode. .PP In addition to these, RDS supports a number of protocol specific options (with socket level .BR SOL_RDS ). Just as with the RDS protocol family, an official value has not been assigned yet, so the kernel will assign a value dynamically. The assigned value can be retrieved from the .B sol_rds sysctl parameter file. .PP RDS specific socket options will be described in a separate section below. .\"------------------------------------------------------------------ .SS Binding A new RDS socket has no local address when it is first returned from .BR socket (2). It must be bound to a local address by calling .BR bind (2) before any messages can be sent or received. 
This will also attach the socket to a specific transport, based on the type of interface the local address is attached to. From that point on, the socket can only reach destinations which are available through this transport. .PP For instance, when binding to the address of an Infiniband interface such as .BR ib0 , the socket will use the Infiniband transport. If RDS is not able to associate a transport with the given address, it will return .BR EADDRNOTAVAIL . .PP An RDS socket can only be bound to one address and only one socket can be bound to a given address/port pair. If no port is specified in the binding address then an unbound port is selected at random. .PP RDS does not allow the application to bind a previously bound socket to another address. Binding to the wildcard address .B INADDR_ANY is not permitted either. .\"------------------------------------------------------------------ .SS Connecting The default mode of operation for RDS is to use unconnected socket, and specify a destination address as an argument to .BR sendmsg . However, RDS allows sockets to be connected to a remote end point using .BR connect (2). If a socket is connected, calling .BR sendmsg without specifying a destination address will use the previously given remote address. .\"------------------------------------------------------------------ .SS Congestion Control RDS does not have explicit congestion control like common streaming protocols such as TCP. However, sockets have two queue limits associated with them; the send queue size and the receive queue size. Messages are accounted based on the number of bytes of payload. .PP The send queue size limits how much data local processes can queue on a local socket (see the following section). If that limit is exceeded, the kernel will not accept further messages until the queue is drained and messages have been delivered to and acknowledged by the remote host. 
.PP The receive queue size limits how much data RDS will put on the receive queue of a socket before marking the socket as .IR congested . When a socket becomes congested, RDS will send a .I congestion map update to the other participating hosts, who are then expected to stop sending more messages to this port. .PP There is a timing window during which a remote host can still continue to send messages to a congested port; RDS solves this by accepting these messages even if the socket's receive queue is already over the limit. .PP As the application pulls incoming messages off the receive queue using .BR recvmsg (2), the number of bytes on the receive queue will eventually drop below the receive queue size, at which point the port is then marked uncongested, and another congestion update is sent to all participating hosts. This tells them to allow applications to send additional messages to this port. .PP The default values for the send and receive buffer size are the system wide socket buffer sizes set in the .B wmem_default and .B rmem_default sysctls, respectively. They can be tuned by the application through the .B SO_SNDBUF and .B SO_RCVBUF socket options. .PP .\"------------------------------------------------------------------ .SS Blocking Behavior The .BR sendmsg (2) and .BR recvmsg (2) calls can block in a variety of situations. Whether a call blocks or returns with an error depends on the non-blocking setting of the file descriptor and the .B MSG_DONTWAIT message flag. If the file descriptor is set to blocking mode (which is the default), and the .B MSG_DONTWAIT flag is .I not given, the call will block. .PP In addition, the .B SO_SNDTIMEO and .B SO_RCVTIMEO socket options can be used to specify a timeout (in seconds) after which the call will abort waiting, and return an error. The default timeout is 0, which tells RDS to block indefinitely. 
.\"------------------------------------------------------------------ .SS Message Transmission Messages may be sent using .BR sendmsg (2) once the RDS socket is bound. Message length cannot exceed 4 gigabytes as the wire protocol uses an unsigned 32 bit integer to express the message length. .PP RDS does not support out of band data. Applications are allowed to send to unicast addresses only; broadcast or multicast are not supported. .PP A successful .BR sendmsg (2) call puts the message in the socket's transmit queue where it will remain until either the destination acknowledges that the message is no longer in the network or the application removes the message from the send queue. .PP Messages can be removed from the send queue with the RDS_CANCEL_SENT_TO socket option described below. .PP While a message is in the transmit queue its payload bytes are accounted for. If an attempt is made to send a message while there is not sufficient room on the transmit queue, the call will either block or return .BR EAGAIN . .PP When trying to send to a destination that is marked congested (see above), the call will either block or return .BR ENOBUFS . .PP A message sent with no payload bytes will not consume any space in the destination's send buffer but will result in a message receipt on the destination. The receiver will not get any payload data but will be able to see the sender's address. .PP Messages sent to a port to which no socket is bound will be silently discarded by the destination host. No error messages are reported to the sender. .\"------------------------------------------------------------------ .SS Message Receipt Messages may be received with .BR recvmsg (2) on an RDS socket once it is bound to a source address. RDS will return messages in-order, i.e. messages from the same sender will arrive in the same order in which they were sent. .PP The address of the sender will be returned in the .B sockaddr_in structure pointed to by the .B msg_name field, if set. 
.PP
If the
.B MSG_PEEK
flag is given, the first message on the receive queue is returned without
removing it from the queue.
.PP
The memory consumed by messages waiting for delivery does not limit the
number of messages that can be queued for receive. RDS does attempt to
perform congestion control as described in the section above.
.PP
If the length of the message exceeds the size of the buffer provided to
.BR recvmsg (2),
then the remainder of the bytes in the message are discarded and the
.BR MSG_TRUNC
flag is set in the msg_flags field. In this truncating case
.BR recvmsg (2)
will still return the number of bytes copied, not the length of the
entire message. If
.BR MSG_TRUNC
is set in the flags argument to
.BR recvmsg (2),
then it will return the number of bytes in the entire message. Thus one
can examine the size of the next message in the receive queue without
incurring a copying overhead by providing a zero length buffer and setting
.BR MSG_PEEK " and " MSG_TRUNC
in the flags argument.
.PP
The sending address of a zero-length message will still be provided in the
.B msg_name
field.
.\"------------------------------------------------------------------
.SS Control Messages
RDS uses control messages (a.k.a. ancillary data) through the
.B msg_control
and
.B msg_controllen
fields in
.BR sendmsg (2)
and
.BR recvmsg (2).
Control messages generated by RDS have a
.BR cmsg_level
value of
.BR sol_rds .
Most control messages are related to the zerocopy interface added in
RDS version 3, and are described in
.BR rds-rdma (7).
.PP
The only exception is the
.BR RDS_CMSG_CONG_UPDATE
message, which is described in the following section.
.\"------------------------------------------------------------------
.SS Polling
RDS supports the
.BR poll (2)
interface in a limited fashion.
.B POLLIN
is returned when there is a message (either a proper RDS message, or a
control message) waiting in the socket's receive queue.
.B POLLOUT
is always returned while there is room on the socket's send queue.
.PP
Sending to congested ports requires special handling. When an application
tries to send to a congested destination, the system call will return
.BR ENOBUFS .
However, it cannot poll for
.BR POLLOUT ,
as there is probably still room on the transmit queue, so the call to
.BR poll (2)
would return immediately, even though the destination is still congested.
.PP
There are two ways of dealing with this situation. The first is to simply
poll for
.BR POLLIN .
By default, a process sleeping in
.BR poll (2)
is always woken up when the congestion map is updated, and thus the
application can retry any previously congested sends.
.PP
The second option is explicit congestion monitoring, which gives the
application more fine-grained control.
.PP
With explicit monitoring, the application polls for
.B POLLIN
as before, and additionally uses the
.BR RDS_CONG_MONITOR
socket option to install a 64bit mask value in the socket, where each bit
corresponds to a group of ports. When a congestion update arrives, RDS
checks the set of ports that became uncongested against the bit mask
installed in the socket. If they overlap, a control message is enqueued
on the socket, and the application is woken up. When it calls
.BR recvmsg (2),
it will be given the control message containing the bitmap.
.PP
The congestion monitor bitmask can be set and queried using
.BR setsockopt (2)
with
.BR RDS_CONG_MONITOR ,
and a pointer to the 64bit mask variable.
.PP
Congestion updates are delivered to the application via
.B RDS_CMSG_CONG_UPDATE
control messages. These control messages are always delivered by
themselves (or possibly with additional control messages), but never
along with an RDS data message. The
.BR cmsg_data
field of the control message is an 8 byte datum containing the 64bit
mask value.
.PP
Applications can use the following macros to test for and set bits in the bitmask:
.PP
.nf
#define RDS_CONG_MONITOR_SIZE 64
#define RDS_CONG_MONITOR_BIT(port) (((unsigned int) port) % RDS_CONG_MONITOR_SIZE)
#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
.fi
.PP
.\"------------------------------------------------------------------
.SS Canceling Messages
An application can cancel (flush) messages from the send queue using the
.BR RDS_CANCEL_SENT_TO
socket option with
.BR setsockopt (2).
This call takes an optional
.B sockaddr_in
address structure as argument. If given, only messages to the destination
specified by this address are discarded. If no address is given, all
pending messages are discarded.
.PP
Note that this affects messages that have not yet been transmitted as well
as messages that have been transmitted, but for which no acknowledgment
from the remote host has been received yet.
.\"------------------------------------------------------------------
.SS Reliability
If
.BR sendmsg (2)
succeeds, RDS guarantees that the message will be visible to
.BR recvmsg (2)
on a socket bound to the destination address as long as that destination
socket remains open.
.PP
If there is no socket bound on the destination, the message is silently
dropped. If the sending RDS can't be sure that there is no socket bound
then it will try to send the message indefinitely until it can be sure or
the sent message is canceled.
.PP
If a socket is closed then all pending sent messages on the socket are
canceled and may or may not be seen by the receiver.
.PP
The RDS_CANCEL_SENT_TO socket option can be used to cancel all pending
messages to a given destination.
.PP
If a receiving socket is closed with pending messages then the sender
considers those messages as having left the network and will not
retransmit them.
.PP
A message will only be seen by
.BR recvmsg (2)
once, unless
.B MSG_PEEK
was specified.
Once the message has been delivered it is removed from the sending
socket's transmit queue.
.PP
All messages sent from the same socket to the same destination will be
delivered in the order they're sent. Messages sent from different sockets,
or to different destinations, may be delivered in any order.
.\"------------------------------------------------------------------
.SH SYSCTL VALUES
These parameters may only be accessed through their files in
.BR /proc/sys/net/rds .
Access through
.BR sysctl (2)
is not supported.
.TP
.B pf_rds
This file contains the string representation of the protocol family
constant passed to
.BR socket (2)
to create a new RDS socket.
.TP
.B sol_rds
This file contains the string representation of the socket level parameter
that is passed to
.BR getsockopt (2)
and
.BR setsockopt (2)
to manipulate RDS socket options.
.TP
.BR max_unacked_bytes " and " max_unacked_packets
These parameters are used to tune the generation of acknowledgements. By
default, the system receiving RDS messages does not send back explicit
acknowledgements unless it transmits a message of its own (in which case
the ACK is piggybacked onto the outgoing message), or when the sending
system requests an ACK.
.IP
However, the sender needs to see an ACK from time to time so that it can
purge old messages from the send queue. The unacked bytes and packet
counters are used to keep track of how much data has been sent without
requesting an ACK. The default is to request an acknowledgement every 16
packets, or every 16 MB, whichever comes first.
.TP
.BR reconnect_delay_min_ms " and " reconnect_delay_max_ms
RDS uses host-to-host connections to transport RDS messages (both for the
TCP and the Infiniband transport). If this connection breaks, RDS will try
to re-establish the connection. Because this reconnect may be triggered by
both hosts at the same time and fail, RDS uses a random backoff before
attempting a reconnect.
These two parameters specify the minimum and maximum delay in milliseconds. The default values are 1 and 1000, respectively. .SH SEE ALSO .BR rds-rdma (7), .BR socket (2), .BR bind (2), .BR sendmsg (2), .BR recvmsg (2), .BR getsockopt(2), .BR setsockopt (2). trunk/README0000644000175000017500000000031111313644724012423 0ustar benoitbenoit == Short build instructions == autoconf ./configure make rpm This should result in an rds-tools rpm which is versioned by the VERSION in the Makefile and the subversion rev that was checked out. trunk/rds-tools.spec.in0000644000175000017500000000140211313644724014754 0ustar benoitbenoitSummary: RDS support tools Name: rds-tools Version: @VERSION@ Release: @RELEASE@ License: GPL/BSD Group: Applications/Internet URL: http://oss.oracle.com/projects/rds/ Source: rds-tools-%{version}-%{release}.tar.gz BuildRoot: /var/tmp/rds-tools-%{version}-%{release} %description rds-tools is a collection of support tools for the RDS socket API. %prep %setup -n rds-tools-%{version}-%{release} %build %configure make %{?_smp_mflags} %install rm -rf $RPM_BUILD_ROOT make DESTDIR=$RPM_BUILD_ROOT install %clean rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root) %{_bindir}/* %{_mandir}/* %{_includedir}/* %changelog * Sun Nov 25 2007 Vladimir Sokolovsky - Use DESTDIR * Mon Oct 27 2006 Zach Brown - initial version trunk/rdstool.h0000644000175000017500000000620411313644724013411 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * tools header stuff */ #ifndef __RDS_TOOL_H #define __RDS_TOOL_H #include #include "kernel-list.h" #include "pfhack.h" #ifndef AF_RDS # define AF_RDS OFFICIAL_PF_RDS #endif #ifndef PF_RDS # define PF_RDS AF_RDS #endif #ifndef SOL_RDS # define SOL_RDS OFFICIAL_SOL_RDS #endif #define RDS_TOOL_BASE_OPTS ":s:m:f:i:-:vqhV" #define RDS_SINK_OPTS #define RDS_GEN_OPTS "d:b:l:" #define RDS_DEFAULT_MSG_SIZE 4096 #define verbosef(lvl, f, fmt, a...) 
do { \ if (verbose >= (lvl)) \ fprintf((f), fmt, ##a); \ } while (0) struct rds_endpoint { struct list_head re_item; char *re_name; struct sockaddr_in re_addr; int re_fd; }; struct rds_context { struct rds_endpoint *rc_saddr; struct list_head rc_daddrs; const char *rc_filename; uint32_t rc_msgsize; uint64_t rc_total; }; /* Set by parse_options() */ extern char *progname; extern unsigned int verbose; extern int parse_options(int argc, char *argv[], const char *opts, struct rds_context *ctxt); extern int rds_bind(struct rds_context *ctxt); extern int dup_file(struct rds_context *ctxt, int fd, int flags); extern int setup_signals(void); extern int runningp(void); /* stats.c */ extern int stats_init(int delay); extern void stats_extended(int extendedp); extern void stats_start(void); extern void stats_print(void); extern void stats_total(void); extern void stats_add_recv(uint64_t bytes); extern void stats_add_send(uint64_t bytes); extern uint64_t stats_get_send(void); extern void stats_add_read(uint64_t bytes); extern void stats_add_write(uint64_t bytes); /* Provided by C files with main() */ extern void print_usage(int rc); extern void print_version(void); #endif /* __RDS_TOOL_H */ trunk/configure0000755000175000017500000017676011313644724013500 0ustar benoitbenoit#! /bin/sh # Guess values for system-dependent variables and create Makefiles. # Generated by GNU Autoconf 2.59. # # Copyright (C) 2003 Free Software Foundation, Inc. # This configure script is free software; the Free Software Foundation # gives unlimited permission to copy, distribute and modify it. ## --------------------- ## ## M4sh Initialization. ## ## --------------------- ## # Be Bourne compatible if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. 
alias -g '${1+"$@"}'='"$@"' elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then set -o posix fi DUALCASE=1; export DUALCASE # for MKS sh # Support unset when possible. if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then as_unset=unset else as_unset=false fi # Work around bugs in pre-3.0 UWIN ksh. $as_unset ENV MAIL MAILPATH PS1='$ ' PS2='> ' PS4='+ ' # NLS nuisances. for as_var in \ LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \ LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \ LC_TELEPHONE LC_TIME do if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then eval $as_var=C; export $as_var else $as_unset $as_var fi done # Required to use basename. if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then as_basename=basename else as_basename=false fi # Name of the executable. as_me=`$as_basename "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)$' \| \ . : '\(.\)' 2>/dev/null || echo X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; } /^X\/\(\/\/\)$/{ s//\1/; q; } /^X\/\(\/\).*/{ s//\1/; q; } s/.*/./; q'` # PATH needs CR, and LINENO needs CR and PATH. # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits # The user is always right. if test "${PATH_SEPARATOR+set}" != set; then echo "#! /bin/sh" >conf$$.sh echo "exit 0" >>conf$$.sh chmod +x conf$$.sh if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then PATH_SEPARATOR=';' else PATH_SEPARATOR=: fi rm -f conf$$.sh fi as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" || { # Find who we are. 
Look in the path if we contain no path at all # relative or not. case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then { echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2 { (exit 1); exit 1; }; } fi case $CONFIG_SHELL in '') as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for as_base in sh bash ksh sh5; do case $as_dir in /*) if ("$as_dir/$as_base" -c ' as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; } $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; } CONFIG_SHELL=$as_dir/$as_base export CONFIG_SHELL exec "$CONFIG_SHELL" "$0" ${1+"$@"} fi;; esac done done ;; esac # Create $as_me.lineno as a copy of $as_myself, but with $LINENO # uniformly replaced by the line number. The first 'sed' inserts a # line-number line before each line; the second 'sed' does the real # work. The second script uses 'N' to pair each line-number line # with the numbered line, and appends trailing '-' during # substitution so that $LINENO is not a special case at line end. # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the # second 'sed' script. Blame Lee E. McMahon for sed's syntax. 
:-) sed '=' <$as_myself | sed ' N s,$,-, : loop s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3, t loop s,-$,, s,^['$as_cr_digits']*\n,, ' >$as_me.lineno && chmod +x $as_me.lineno || { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 { (exit 1); exit 1; }; } # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensible to this). . ./$as_me.lineno # Exit status is that of the last command. exit } case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in *c*,-n*) ECHO_N= ECHO_C=' ' ECHO_T=' ' ;; *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;; *) ECHO_N= ECHO_C='\c' ECHO_T= ;; esac if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi rm -f conf$$ conf$$.exe conf$$.file echo >conf$$.file if ln -s conf$$.file conf$$ 2>/dev/null; then # We could just check for DJGPP; but this test a) works b) is more generic # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04). if test -f conf$$.exe; then # Don't use ln at all; we don't have any links as_ln_s='cp -p' else as_ln_s='ln -s' fi elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.file if mkdir -p . 2>/dev/null; then as_mkdir_p=: else test -d ./-p && rmdir ./-p as_mkdir_p=false fi as_executable_p="test -f" # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" # Sed expression to map a string onto a valid variable name. as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" # IFS # We need space, tab and new line, in precisely that order. as_nl=' ' IFS=" $as_nl" # CDPATH. $as_unset CDPATH # Name of the host. # hostname on some systems (SVR3.2, Linux) returns a bogus exit status, # so uname gets run too. 
ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` exec 6>&1 # # Initializations. # ac_default_prefix=/usr/local ac_config_libobj_dir=. cross_compiling=no subdirs= MFLAGS= MAKEFLAGS= SHELL=${CONFIG_SHELL-/bin/sh} # Maximum number of lines to put in a shell here document. # This variable seems obsolete. It should probably be removed, and # only ac_max_sed_lines should be used. : ${ac_max_here_lines=38} # Identity of this package. PACKAGE_NAME= PACKAGE_TARNAME= PACKAGE_VERSION= PACKAGE_STRING= PACKAGE_BUGREPORT= ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS VERSION RELEASE LIBOBJS LTLIBOBJS' ac_subst_files='' # Initialize some variables set by options. ac_init_help= ac_init_version=false # The variables have the same names as the options, with # dashes changed to underlines. cache_file=/dev/null exec_prefix=NONE no_create= no_recursion= prefix=NONE program_prefix=NONE program_suffix=NONE program_transform_name=s,x,x, silent= site= srcdir= verbose= x_includes=NONE x_libraries=NONE # Installation directory options. # These are left unexpanded so users can "make install exec_prefix=/foo" # and all the variables that are supposed to be based on exec_prefix # by default will actually change. # Use braces instead of parens because sh, perl, etc. also accept them. 
bindir='${exec_prefix}/bin' sbindir='${exec_prefix}/sbin' libexecdir='${exec_prefix}/libexec' datadir='${prefix}/share' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' libdir='${exec_prefix}/lib' includedir='${prefix}/include' oldincludedir='/usr/include' infodir='${prefix}/info' mandir='${prefix}/man' ac_prev= for ac_option do # If the previous option needs an argument, assign it. if test -n "$ac_prev"; then eval "$ac_prev=\$ac_option" ac_prev= continue fi ac_optarg=`expr "x$ac_option" : 'x[^=]*=\(.*\)'` # Accept the important Cygnus configure options, so we can diagnose typos. case $ac_option in -bindir | --bindir | --bindi | --bind | --bin | --bi) ac_prev=bindir ;; -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) bindir=$ac_optarg ;; -build | --build | --buil | --bui | --bu) ac_prev=build_alias ;; -build=* | --build=* | --buil=* | --bui=* | --bu=*) build_alias=$ac_optarg ;; -cache-file | --cache-file | --cache-fil | --cache-fi \ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) ac_prev=cache_file ;; -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) cache_file=$ac_optarg ;; --config-cache | -C) cache_file=config.cache ;; -datadir | --datadir | --datadi | --datad | --data | --dat | --da) ac_prev=datadir ;; -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ | --da=*) datadir=$ac_optarg ;; -disable-* | --disable-*) ac_feature=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid feature name: $ac_feature" >&2 { (exit 1); exit 1; }; } ac_feature=`echo $ac_feature | sed 's/-/_/g'` eval "enable_$ac_feature=no" ;; -enable-* | --enable-*) ac_feature=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. 
expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid feature name: $ac_feature" >&2 { (exit 1); exit 1; }; } ac_feature=`echo $ac_feature | sed 's/-/_/g'` case $ac_option in *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;; *) ac_optarg=yes ;; esac eval "enable_$ac_feature='$ac_optarg'" ;; -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ | --exec | --exe | --ex) ac_prev=exec_prefix ;; -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ | --exec=* | --exe=* | --ex=*) exec_prefix=$ac_optarg ;; -gas | --gas | --ga | --g) # Obsolete; use --with-gas. with_gas=yes ;; -help | --help | --hel | --he | -h) ac_init_help=long ;; -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) ac_init_help=recursive ;; -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) ac_init_help=short ;; -host | --host | --hos | --ho) ac_prev=host_alias ;; -host=* | --host=* | --hos=* | --ho=*) host_alias=$ac_optarg ;; -includedir | --includedir | --includedi | --included | --include \ | --includ | --inclu | --incl | --inc) ac_prev=includedir ;; -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ | --includ=* | --inclu=* | --incl=* | --inc=*) includedir=$ac_optarg ;; -infodir | --infodir | --infodi | --infod | --info | --inf) ac_prev=infodir ;; -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) infodir=$ac_optarg ;; -libdir | --libdir | --libdi | --libd) ac_prev=libdir ;; -libdir=* | --libdir=* | --libdi=* | --libd=*) libdir=$ac_optarg ;; -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ | --libexe | --libex | --libe) ac_prev=libexecdir ;; -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ | --libexe=* | --libex=* | --libe=*) libexecdir=$ac_optarg ;; -localstatedir | --localstatedir | --localstatedi | 
--localstated \ | --localstate | --localstat | --localsta | --localst \ | --locals | --local | --loca | --loc | --lo) ac_prev=localstatedir ;; -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ | --localstate=* | --localstat=* | --localsta=* | --localst=* \ | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) localstatedir=$ac_optarg ;; -mandir | --mandir | --mandi | --mand | --man | --ma | --m) ac_prev=mandir ;; -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) mandir=$ac_optarg ;; -nfp | --nfp | --nf) # Obsolete; use --without-fp. with_fp=no ;; -no-create | --no-create | --no-creat | --no-crea | --no-cre \ | --no-cr | --no-c | -n) no_create=yes ;; -no-recursion | --no-recursion | --no-recursio | --no-recursi \ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) no_recursion=yes ;; -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ | --oldin | --oldi | --old | --ol | --o) ac_prev=oldincludedir ;; -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) oldincludedir=$ac_optarg ;; -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) ac_prev=prefix ;; -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) prefix=$ac_optarg ;; -program-prefix | --program-prefix | --program-prefi | --program-pref \ | --program-pre | --program-pr | --program-p) ac_prev=program_prefix ;; -program-prefix=* | --program-prefix=* | --program-prefi=* \ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) program_prefix=$ac_optarg ;; -program-suffix | --program-suffix | --program-suffi | --program-suff \ | --program-suf | --program-su | --program-s) ac_prev=program_suffix ;; -program-suffix=* | --program-suffix=* | --program-suffi=* \ | --program-suff=* 
| --program-suf=* | --program-su=* | --program-s=*) program_suffix=$ac_optarg ;; -program-transform-name | --program-transform-name \ | --program-transform-nam | --program-transform-na \ | --program-transform-n | --program-transform- \ | --program-transform | --program-transfor \ | --program-transfo | --program-transf \ | --program-trans | --program-tran \ | --progr-tra | --program-tr | --program-t) ac_prev=program_transform_name ;; -program-transform-name=* | --program-transform-name=* \ | --program-transform-nam=* | --program-transform-na=* \ | --program-transform-n=* | --program-transform-=* \ | --program-transform=* | --program-transfor=* \ | --program-transfo=* | --program-transf=* \ | --program-trans=* | --program-tran=* \ | --progr-tra=* | --program-tr=* | --program-t=*) program_transform_name=$ac_optarg ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) silent=yes ;; -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ | --sbi=* | --sb=*) sbindir=$ac_optarg ;; -sharedstatedir | --sharedstatedir | --sharedstatedi \ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ | --sharedst | --shareds | --shared | --share | --shar \ | --sha | --sh) ac_prev=sharedstatedir ;; -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ | --sha=* | --sh=*) sharedstatedir=$ac_optarg ;; -site | --site | --sit) ac_prev=site ;; -site=* | --site=* | --sit=*) site=$ac_optarg ;; -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) ac_prev=srcdir ;; -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) srcdir=$ac_optarg ;; -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ | --syscon | --sysco | --sysc | --sys | --sy) ac_prev=sysconfdir ;; -sysconfdir=* 
| --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) sysconfdir=$ac_optarg ;; -target | --target | --targe | --targ | --tar | --ta | --t) ac_prev=target_alias ;; -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) target_alias=$ac_optarg ;; -v | -verbose | --verbose | --verbos | --verbo | --verb) verbose=yes ;; -version | --version | --versio | --versi | --vers | -V) ac_init_version=: ;; -with-* | --with-*) ac_package=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid package name: $ac_package" >&2 { (exit 1); exit 1; }; } ac_package=`echo $ac_package| sed 's/-/_/g'` case $ac_option in *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;; *) ac_optarg=yes ;; esac eval "with_$ac_package='$ac_optarg'" ;; -without-* | --without-*) ac_package=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid package name: $ac_package" >&2 { (exit 1); exit 1; }; } ac_package=`echo $ac_package | sed 's/-/_/g'` eval "with_$ac_package=no" ;; --x) # Obsolete; use --with-x. 
with_x=yes ;; -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ | --x-incl | --x-inc | --x-in | --x-i) ac_prev=x_includes ;; -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) x_includes=$ac_optarg ;; -x-libraries | --x-libraries | --x-librarie | --x-librari \ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) ac_prev=x_libraries ;; -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) x_libraries=$ac_optarg ;; -*) { echo "$as_me: error: unrecognized option: $ac_option Try \`$0 --help' for more information." >&2 { (exit 1); exit 1; }; } ;; *=*) ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` # Reject names that are not valid shell variable names. expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid variable name: $ac_envvar" >&2 { (exit 1); exit 1; }; } ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` eval "$ac_envvar='$ac_optarg'" export $ac_envvar ;; *) # FIXME: should be removed in autoconf 3.0. echo "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && echo "$as_me: WARNING: invalid host type: $ac_option" >&2 : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} ;; esac done if test -n "$ac_prev"; then ac_option=--`echo $ac_prev | sed 's/_/-/g'` { echo "$as_me: error: missing argument to $ac_option" >&2 { (exit 1); exit 1; }; } fi # Be sure to have absolute paths. for ac_var in exec_prefix prefix do eval ac_val=$`echo $ac_var` case $ac_val in [\\/$]* | ?:[\\/]* | NONE | '' ) ;; *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 { (exit 1); exit 1; }; };; esac done # Be sure to have absolute paths. 
for ac_var in bindir sbindir libexecdir datadir sysconfdir sharedstatedir \ localstatedir libdir includedir oldincludedir infodir mandir do eval ac_val=$`echo $ac_var` case $ac_val in [\\/$]* | ?:[\\/]* ) ;; *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 { (exit 1); exit 1; }; };; esac done # There might be people who depend on the old broken behavior: `$host' # used to hold the argument of --host etc. # FIXME: To remove some day. build=$build_alias host=$host_alias target=$target_alias # FIXME: To remove some day. if test "x$host_alias" != x; then if test "x$build_alias" = x; then cross_compiling=maybe echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. If a cross compiler is detected then cross compile mode will be used." >&2 elif test "x$build_alias" != "x$host_alias"; then cross_compiling=yes fi fi ac_tool_prefix= test -n "$host_alias" && ac_tool_prefix=$host_alias- test "$silent" = yes && exec 6>/dev/null # Find the source files, if location was not specified. if test -z "$srcdir"; then ac_srcdir_defaulted=yes # Try the directory containing this script, then its parent. ac_confdir=`(dirname "$0") 2>/dev/null || $as_expr X"$0" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$0" : 'X\(//\)[^/]' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$0" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` srcdir=$ac_confdir if test ! -r $srcdir/$ac_unique_file; then srcdir=.. fi else ac_srcdir_defaulted=no fi if test ! -r $srcdir/$ac_unique_file; then if test "$ac_srcdir_defaulted" = yes; then { echo "$as_me: error: cannot find sources ($ac_unique_file) in $ac_confdir or .." 
>&2 { (exit 1); exit 1; }; } else { echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 { (exit 1); exit 1; }; } fi fi (cd $srcdir && test -r ./$ac_unique_file) 2>/dev/null || { echo "$as_me: error: sources are in $srcdir, but \`cd $srcdir' does not work" >&2 { (exit 1); exit 1; }; } srcdir=`echo "$srcdir" | sed 's%\([^\\/]\)[\\/]*$%\1%'` ac_env_build_alias_set=${build_alias+set} ac_env_build_alias_value=$build_alias ac_cv_env_build_alias_set=${build_alias+set} ac_cv_env_build_alias_value=$build_alias ac_env_host_alias_set=${host_alias+set} ac_env_host_alias_value=$host_alias ac_cv_env_host_alias_set=${host_alias+set} ac_cv_env_host_alias_value=$host_alias ac_env_target_alias_set=${target_alias+set} ac_env_target_alias_value=$target_alias ac_cv_env_target_alias_set=${target_alias+set} ac_cv_env_target_alias_value=$target_alias # # Report the --help message. # if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF \`configure' configures this package to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... To assign environment variables (e.g., CC, CFLAGS...), specify them as VAR=VALUE. See below for descriptions of some of the useful variables. Defaults for the options are specified in brackets. Configuration: -h, --help display this help and exit --help=short display options specific to this package --help=recursive display the short help of all the included packages -V, --version display version information and exit -q, --quiet, --silent do not print \`checking...' 
messages --cache-file=FILE cache test results in FILE [disabled] -C, --config-cache alias for \`--cache-file=config.cache' -n, --no-create do not create output files --srcdir=DIR find the sources in DIR [configure dir or \`..'] _ACEOF cat <<_ACEOF Installation directories: --prefix=PREFIX install architecture-independent files in PREFIX [$ac_default_prefix] --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX [PREFIX] By default, \`make install' will install all the files in \`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify an installation prefix other than \`$ac_default_prefix' using \`--prefix', for instance \`--prefix=\$HOME'. For better control, use the options below. Fine tuning of the installation directories: --bindir=DIR user executables [EPREFIX/bin] --sbindir=DIR system admin executables [EPREFIX/sbin] --libexecdir=DIR program executables [EPREFIX/libexec] --datadir=DIR read-only architecture-independent data [PREFIX/share] --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] --infodir=DIR info documentation [PREFIX/info] --mandir=DIR man documentation [PREFIX/man] _ACEOF cat <<\_ACEOF _ACEOF fi if test -n "$ac_init_help"; then cat <<\_ACEOF _ACEOF fi if test "$ac_init_help" = "recursive"; then # If there are subdirs, report their specific --help. ac_popdir=`pwd` for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue test -d $ac_dir || continue ac_builddir=. if test "$ac_dir" != .; then ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` # A "../" for each directory in $ac_dir_suffix. 
ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'` else ac_dir_suffix= ac_top_builddir= fi case $srcdir in .) # No --srcdir option. We are building in place. ac_srcdir=. if test -z "$ac_top_builddir"; then ac_top_srcdir=. else ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'` fi ;; [\\/]* | ?:[\\/]* ) # Absolute path. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ;; *) # Relative path. ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_builddir$srcdir ;; esac # Do not use `cd foo && pwd` to compute absolute paths, because # the directories may not exist. case `pwd` in .) ac_abs_builddir="$ac_dir";; *) case "$ac_dir" in .) ac_abs_builddir=`pwd`;; [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";; *) ac_abs_builddir=`pwd`/"$ac_dir";; esac;; esac case $ac_abs_builddir in .) ac_abs_top_builddir=${ac_top_builddir}.;; *) case ${ac_top_builddir}. in .) ac_abs_top_builddir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;; *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;; esac;; esac case $ac_abs_builddir in .) ac_abs_srcdir=$ac_srcdir;; *) case $ac_srcdir in .) ac_abs_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;; *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;; esac;; esac case $ac_abs_builddir in .) ac_abs_top_srcdir=$ac_top_srcdir;; *) case $ac_top_srcdir in .) ac_abs_top_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;; *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;; esac;; esac cd $ac_dir # Check for guested configure; otherwise get Cygnus style configure. 
if test -f $ac_srcdir/configure.gnu; then echo $SHELL $ac_srcdir/configure.gnu --help=recursive elif test -f $ac_srcdir/configure; then echo $SHELL $ac_srcdir/configure --help=recursive elif test -f $ac_srcdir/configure.ac || test -f $ac_srcdir/configure.in; then echo $ac_configure --help else echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 fi cd $ac_popdir done fi test -n "$ac_init_help" && exit 0 if $ac_init_version; then cat <<\_ACEOF Copyright (C) 2003 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF exit 0 fi exec 5>config.log cat >&5 <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by $as_me, which was generated by GNU Autoconf 2.59. Invocation command line was $ $0 $@ _ACEOF { cat <<_ASUNAME ## --------- ## ## Platform. ## ## --------- ## hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` /bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` /bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` hostinfo = `(hostinfo) 2>/dev/null || echo unknown` /bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` /bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` _ASUNAME as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
echo "PATH: $as_dir" done } >&5 cat >&5 <<_ACEOF ## ----------- ## ## Core tests. ## ## ----------- ## _ACEOF # Keep a trace of the command line. # Strip out --no-create and --no-recursion so they do not pile up. # Strip out --silent because we don't want to record it for future runs. # Also quote any args containing shell meta-characters. # Make two passes to allow for proper duplicate-argument suppression. ac_configure_args= ac_configure_args0= ac_configure_args1= ac_sep= ac_must_keep_next=false for ac_pass in 1 2 do for ac_arg do case $ac_arg in -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) continue ;; *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*) ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; 2) ac_configure_args1="$ac_configure_args1 '$ac_arg'" if test $ac_must_keep_next = true; then ac_must_keep_next=false # Got value, back to normal. else case $ac_arg in *=* | --config-cache | -C | -disable-* | --disable-* \ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ | -with-* | --with-* | -without-* | --without-* | --x) case "$ac_configure_args0 " in "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; esac ;; -* ) ac_must_keep_next=true ;; esac fi ac_configure_args="$ac_configure_args$ac_sep'$ac_arg'" # Get rid of the leading space. ac_sep=" " ;; esac done done $as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } $as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } # When interrupted or exit'd, cleanup temporary files, and complete # config.log. We remove comments because anyway the quotes in there # would cause problems or look ugly. 
# WARNING: Be sure not to use single quotes in there, as some shells, # such as our DU 5.0 friend, will then `close' the trap. trap 'exit_status=$? # Save into config.log some information that might help in debugging. { echo cat <<\_ASBOX ## ---------------- ## ## Cache variables. ## ## ---------------- ## _ASBOX echo # The following way of writing the cache mishandles newlines in values, { (set) 2>&1 | case `(ac_space='"'"' '"'"'; set | grep ac_space) 2>&1` in *ac_space=\ *) sed -n \ "s/'"'"'/'"'"'\\\\'"'"''"'"'/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='"'"'\\2'"'"'/p" ;; *) sed -n \ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p" ;; esac; } echo cat <<\_ASBOX ## ----------------- ## ## Output variables. ## ## ----------------- ## _ASBOX echo for ac_var in $ac_subst_vars do eval ac_val=$`echo $ac_var` echo "$ac_var='"'"'$ac_val'"'"'" done | sort echo if test -n "$ac_subst_files"; then cat <<\_ASBOX ## ------------- ## ## Output files. ## ## ------------- ## _ASBOX echo for ac_var in $ac_subst_files do eval ac_val=$`echo $ac_var` echo "$ac_var='"'"'$ac_val'"'"'" done | sort echo fi if test -s confdefs.h; then cat <<\_ASBOX ## ----------- ## ## confdefs.h. ## ## ----------- ## _ASBOX echo sed "/^$/d" confdefs.h | sort echo fi test "$ac_signal" != 0 && echo "$as_me: caught signal $ac_signal" echo "$as_me: exit $exit_status" } >&5 rm -f core *.core && rm -rf conftest* confdefs* conf$$* $ac_clean_files && exit $exit_status ' 0 for ac_signal in 1 2 13 15; do trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal done ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -rf conftest* confdefs.h # AIX cpp loses on an empty file, so make sure it contains at least a newline. echo >confdefs.h # Predefined preprocessor variables. 
cat >>confdefs.h <<_ACEOF #define PACKAGE_NAME "$PACKAGE_NAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_TARNAME "$PACKAGE_TARNAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_VERSION "$PACKAGE_VERSION" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_STRING "$PACKAGE_STRING" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" _ACEOF # Let the site file select an alternate cache file if it wants to. # Prefer explicitly selected file to automatically selected ones. if test -z "$CONFIG_SITE"; then if test "x$prefix" != xNONE; then CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" else CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" fi fi for ac_site_file in $CONFIG_SITE; do if test -r "$ac_site_file"; then { echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 echo "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 . "$ac_site_file" fi done if test -r "$cache_file"; then # Some versions of bash will fail to source /dev/null (special # files actually), so we avoid doing that. if test -f "$cache_file"; then { echo "$as_me:$LINENO: loading cache $cache_file" >&5 echo "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . $cache_file;; *) . ./$cache_file;; esac fi else { echo "$as_me:$LINENO: creating cache $cache_file" >&5 echo "$as_me: creating cache $cache_file" >&6;} >$cache_file fi # Check that the precious variables saved in the cache have kept the same # value. 
ac_cache_corrupted=false for ac_var in `(set) 2>&1 | sed -n 's/^ac_env_\([a-zA-Z_0-9]*\)_set=.*/\1/p'`; do eval ac_old_set=\$ac_cv_env_${ac_var}_set eval ac_new_set=\$ac_env_${ac_var}_set eval ac_old_val="\$ac_cv_env_${ac_var}_value" eval ac_new_val="\$ac_env_${ac_var}_value" case $ac_old_set,$ac_new_set in set,) { echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} ac_cache_corrupted=: ;; ,set) { echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} ac_cache_corrupted=: ;; ,);; *) if test "x$ac_old_val" != "x$ac_new_val"; then { echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} { echo "$as_me:$LINENO: former value: $ac_old_val" >&5 echo "$as_me: former value: $ac_old_val" >&2;} { echo "$as_me:$LINENO: current value: $ac_new_val" >&5 echo "$as_me: current value: $ac_new_val" >&2;} ac_cache_corrupted=: fi;; esac # Pass precious variables to config.status. if test "$ac_new_set" = set; then case $ac_new_val in *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*) ac_arg=$ac_var=`echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; *) ac_arg=$ac_var=$ac_new_val ;; esac case " $ac_configure_args " in *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. 
*) ac_configure_args="$ac_configure_args '$ac_arg'" ;; esac fi done if $ac_cache_corrupted; then { echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 echo "$as_me: error: changes in the environment can compromise the build" >&2;} { { echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} { (exit 1); exit 1; }; } fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu VERSION=1.4 RELEASE=1 ac_config_files="$ac_config_files Makefile rds-tools.spec" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure # scripts and configure runs, see configure's option --config-cache. # It is not useful on other systems. If it contains results you don't # want to keep, you may remove or edit it. # # config.status only pays attention to the cache file if you give it # the --recheck option to rerun configure. # # `ac_cv_env_foo' variables (set or unset) will be overridden when # loading this file, other *unset* `ac_cv_foo' will be assigned the # following values. _ACEOF # The following way of writing the cache mishandles newlines in values, # but we know of no workaround that is simple, portable, and efficient. # So, don't put newlines in cache variables' values. # Ultrix sh set writes to stderr and can't be redirected directly, # and sets the high bit in the cache file unless we assign to the vars. { (set) 2>&1 | case `(ac_space=' '; set | grep ac_space) 2>&1` in *ac_space=\ *) # `set' does not quote correctly, so add quotes (double-quote # substitution turns \\\\ into \\, and sed turns \\ into \). 
sed -n \ "s/'/'\\\\''/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" ;; *) # `set' quotes correctly as required by POSIX, so do not add quotes. sed -n \ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p" ;; esac; } | sed ' t clear : clear s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ t end /^ac_cv_env/!s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ : end' >>confcache if diff $cache_file confcache >/dev/null 2>&1; then :; else if test -w $cache_file; then test "x$cache_file" != "x/dev/null" && echo "updating cache $cache_file" cat confcache >$cache_file else echo "not updating unwritable cache $cache_file" fi fi rm -f confcache test "x$prefix" = xNONE && prefix=$ac_default_prefix # Let make expand exec_prefix. test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' # VPATH may cause trouble with some makes, so we remove $(srcdir), # ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and # trailing colons and then remove the whole line if VPATH becomes empty # (actually we leave an empty line to preserve line numbers). if test "x$srcdir" = x.; then ac_vpsub='/^[ ]*VPATH[ ]*=/{ s/:*\$(srcdir):*/:/; s/:*\${srcdir}:*/:/; s/:*@srcdir@:*/:/; s/^\([^=]*=[ ]*\):*/\1/; s/:*$//; s/^[^=]*=[ ]*$//; }' fi # Transform confdefs.h into DEFS. # Protect against shell expansion while executing Makefile rules. # Protect against Makefile macro expansion. # # If the first sed substitution is executed (which looks for macros that # take arguments), then we branch to the quote section. Otherwise, # look for a macro that doesn't take arguments. cat >confdef2opt.sed <<\_ACEOF t clear : clear s,^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\),-D\1=\2,g t quote s,^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\),-D\1=\2,g t quote d : quote s,[ `~#$^&*(){}\\|;'"<>?],\\&,g s,\[,\\&,g s,\],\\&,g s,\$,$$,g p _ACEOF # We use echo to avoid assuming a particular line-breaking character. 
# The extra dot is to prevent the shell from consuming trailing # line-breaks from the sub-command output. A line-break within # single-quotes doesn't work because, if this script is created in a # platform that uses two characters for line-breaks (e.g., DOS), tr # would break. ac_LF_and_DOT=`echo; echo .` DEFS=`sed -n -f confdef2opt.sed confdefs.h | tr "$ac_LF_and_DOT" ' .'` rm -f confdef2opt.sed ac_libobjs= ac_ltlibobjs= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. ac_i=`echo "$ac_i" | sed 's/\$U\././;s/\.o$//;s/\.obj$//'` # 2. Add them. ac_libobjs="$ac_libobjs $ac_i\$U.$ac_objext" ac_ltlibobjs="$ac_ltlibobjs $ac_i"'$U.lo' done LIBOBJS=$ac_libobjs LTLIBOBJS=$ac_ltlibobjs : ${CONFIG_STATUS=./config.status} ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" { echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 echo "$as_me: creating $CONFIG_STATUS" >&6;} cat >$CONFIG_STATUS <<_ACEOF #! $SHELL # Generated by $as_me. # Run this file to recreate the current configuration. # Compiler output produced by configure, useful for debugging # configure, is in config.log if it exists. debug=false ac_cs_recheck=false ac_cs_silent=false SHELL=\${CONFIG_SHELL-$SHELL} _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF ## --------------------- ## ## M4sh Initialization. ## ## --------------------- ## # Be Bourne compatible if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then set -o posix fi DUALCASE=1; export DUALCASE # for MKS sh # Support unset when possible. if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then as_unset=unset else as_unset=false fi # Work around bugs in pre-3.0 UWIN ksh. 
$as_unset ENV MAIL MAILPATH PS1='$ ' PS2='> ' PS4='+ ' # NLS nuisances. for as_var in \ LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \ LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \ LC_TELEPHONE LC_TIME do if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then eval $as_var=C; export $as_var else $as_unset $as_var fi done # Required to use basename. if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then as_basename=basename else as_basename=false fi # Name of the executable. as_me=`$as_basename "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)$' \| \ . : '\(.\)' 2>/dev/null || echo X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; } /^X\/\(\/\/\)$/{ s//\1/; q; } /^X\/\(\/\).*/{ s//\1/; q; } s/.*/./; q'` # PATH needs CR, and LINENO needs CR and PATH. # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits # The user is always right. if test "${PATH_SEPARATOR+set}" != set; then echo "#! /bin/sh" >conf$$.sh echo "exit 0" >>conf$$.sh chmod +x conf$$.sh if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then PATH_SEPARATOR=';' else PATH_SEPARATOR=: fi rm -f conf$$.sh fi as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" || { # Find who we are. Look in the path if we contain no path at all # relative or not. case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then { { echo "$as_me:$LINENO: error: cannot find myself; rerun with an absolute path" >&5 echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2;} { (exit 1); exit 1; }; } fi case $CONFIG_SHELL in '') as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for as_base in sh bash ksh sh5; do case $as_dir in /*) if ("$as_dir/$as_base" -c ' as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; } $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; } CONFIG_SHELL=$as_dir/$as_base export CONFIG_SHELL exec "$CONFIG_SHELL" "$0" ${1+"$@"} fi;; esac done done ;; esac # Create $as_me.lineno as a copy of $as_myself, but with $LINENO # uniformly replaced by the line number. The first 'sed' inserts a # line-number line before each line; the second 'sed' does the real # work. The second script uses 'N' to pair each line-number line # with the numbered line, and appends trailing '-' during # substitution so that $LINENO is not a special case at line end. # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the # second 'sed' script. Blame Lee E. McMahon for sed's syntax. 
:-) sed '=' <$as_myself | sed ' N s,$,-, : loop s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3, t loop s,-$,, s,^['$as_cr_digits']*\n,, ' >$as_me.lineno && chmod +x $as_me.lineno || { { echo "$as_me:$LINENO: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&5 echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2;} { (exit 1); exit 1; }; } # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensible to this). . ./$as_me.lineno # Exit status is that of the last command. exit } case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in *c*,-n*) ECHO_N= ECHO_C=' ' ECHO_T=' ' ;; *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;; *) ECHO_N= ECHO_C='\c' ECHO_T= ;; esac if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi rm -f conf$$ conf$$.exe conf$$.file echo >conf$$.file if ln -s conf$$.file conf$$ 2>/dev/null; then # We could just check for DJGPP; but this test a) works b) is more generic # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04). if test -f conf$$.exe; then # Don't use ln at all; we don't have any links as_ln_s='cp -p' else as_ln_s='ln -s' fi elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.file if mkdir -p . 2>/dev/null; then as_mkdir_p=: else test -d ./-p && rmdir ./-p as_mkdir_p=false fi as_executable_p="test -f" # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" # Sed expression to map a string onto a valid variable name. as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" # IFS # We need space, tab and new line, in precisely that order. as_nl=' ' IFS=" $as_nl" # CDPATH. 
$as_unset CDPATH exec 6>&1 # Open the log real soon, to keep \$[0] and so on meaningful, and to # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. Logging --version etc. is OK. exec 5>>config.log { echo sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX ## Running $as_me. ## _ASBOX } >&5 cat >&5 <<_CSEOF This file was extended by $as_me, which was generated by GNU Autoconf 2.59. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS CONFIG_LINKS = $CONFIG_LINKS CONFIG_COMMANDS = $CONFIG_COMMANDS $ $0 $@ _CSEOF echo "on `(hostname || uname -n) 2>/dev/null | sed 1q`" >&5 echo >&5 _ACEOF # Files that config.status was made for. if test -n "$ac_config_files"; then echo "config_files=\"$ac_config_files\"" >>$CONFIG_STATUS fi if test -n "$ac_config_headers"; then echo "config_headers=\"$ac_config_headers\"" >>$CONFIG_STATUS fi if test -n "$ac_config_links"; then echo "config_links=\"$ac_config_links\"" >>$CONFIG_STATUS fi if test -n "$ac_config_commands"; then echo "config_commands=\"$ac_config_commands\"" >>$CONFIG_STATUS fi cat >>$CONFIG_STATUS <<\_ACEOF ac_cs_usage="\ \`$as_me' instantiates files from templates according to the current configuration. Usage: $0 [OPTIONS] [FILE]... -h, --help print this help, then exit -V, --version print version number, then exit -q, --quiet do not print progress messages -d, --debug don't remove temporary files --recheck update $as_me by reconfiguring in the same conditions --file=FILE[:TEMPLATE] instantiate the configuration file FILE Configuration files: $config_files Report bugs to ." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ config.status configured by $0, generated by GNU Autoconf 2.59, with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\" Copyright (C) 2003 Free Software Foundation, Inc. 
This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." srcdir=$srcdir _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # If no file are specified by the user, then we need to provide default # value. By we need to know if files were specified by the user. ac_need_defaults=: while test $# != 0 do case $1 in --*=*) ac_option=`expr "x$1" : 'x\([^=]*\)='` ac_optarg=`expr "x$1" : 'x[^=]*=\(.*\)'` ac_shift=: ;; -*) ac_option=$1 ac_optarg=$2 ac_shift=shift ;; *) # This is not an option, so the user has probably given explicit # arguments. ac_option=$1 ac_need_defaults=false;; esac case $ac_option in # Handling of the options. _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) ac_cs_recheck=: ;; --version | --vers* | -V ) echo "$ac_cs_version"; exit 0 ;; --he | --h) # Conflict between --help and --header { { echo "$as_me:$LINENO: error: ambiguous option: $1 Try \`$0 --help' for more information." >&5 echo "$as_me: error: ambiguous option: $1 Try \`$0 --help' for more information." >&2;} { (exit 1); exit 1; }; };; --help | --hel | -h ) echo "$ac_cs_usage"; exit 0 ;; --debug | --d* | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift CONFIG_FILES="$CONFIG_FILES $ac_optarg" ac_need_defaults=false;; --header | --heade | --head | --hea ) $ac_shift CONFIG_HEADERS="$CONFIG_HEADERS $ac_optarg" ac_need_defaults=false;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil | --si | --s) ac_cs_silent=: ;; # This is an error. -*) { { echo "$as_me:$LINENO: error: unrecognized option: $1 Try \`$0 --help' for more information." >&5 echo "$as_me: error: unrecognized option: $1 Try \`$0 --help' for more information." 
>&2;} { (exit 1); exit 1; }; } ;; *) ac_config_targets="$ac_config_targets $1" ;; esac shift done ac_configure_extra_args= if $ac_cs_silent; then exec 6>/dev/null ac_configure_extra_args="$ac_configure_extra_args --silent" fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF if \$ac_cs_recheck; then echo "running $SHELL $0 " $ac_configure_args \$ac_configure_extra_args " --no-create --no-recursion" >&6 exec $SHELL $0 $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion fi _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF for ac_config_target in $ac_config_targets do case "$ac_config_target" in # Handling of arguments. "Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;; "rds-tools.spec" ) CONFIG_FILES="$CONFIG_FILES rds-tools.spec" ;; *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 echo "$as_me: error: invalid argument: $ac_config_target" >&2;} { (exit 1); exit 1; }; };; esac done # If the user did not use the arguments to specify the items to instantiate, # then the envvar interface is used. Set only those that are not. # We use the long form for the default assignment because of an extremely # bizarre bug on SunOS 4.1.3. if $ac_need_defaults; then test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files fi # Have a temporary directory for convenience. Make it in the build tree # simply because there is no reason to put it here, and in addition, # creating and moving files from /tmp can sometimes cause problems. # Create a temporary directory, and hook for its removal unless debugging. $debug || { trap 'exit_status=$?; rm -rf $tmp && exit $exit_status' 0 trap '{ (exit 1); exit 1; }' 1 2 13 15 } # Create a (secure) tmp directory for tmp files. { tmp=`(umask 077 && mktemp -d -q "./confstatXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" } || { tmp=./confstat$$-$RANDOM (umask 077 && mkdir $tmp) } || { echo "$me: cannot create a temporary directory in ." 
>&2 { (exit 1); exit 1; } } _ACEOF cat >>$CONFIG_STATUS <<_ACEOF # # CONFIG_FILES section. # # No need to generate the scripts if there are no CONFIG_FILES. # This happens for instance when ./config.status config.h if test -n "\$CONFIG_FILES"; then # Protect against being on the right side of a sed subst in config.status. sed 's/,@/@@/; s/@,/@@/; s/,;t t\$/@;t t/; /@;t t\$/s/[\\\\&,]/\\\\&/g; s/@@/,@/; s/@@/@,/; s/@;t t\$/,;t t/' >\$tmp/subs.sed <<\\CEOF s,@SHELL@,$SHELL,;t t s,@PATH_SEPARATOR@,$PATH_SEPARATOR,;t t s,@PACKAGE_NAME@,$PACKAGE_NAME,;t t s,@PACKAGE_TARNAME@,$PACKAGE_TARNAME,;t t s,@PACKAGE_VERSION@,$PACKAGE_VERSION,;t t s,@PACKAGE_STRING@,$PACKAGE_STRING,;t t s,@PACKAGE_BUGREPORT@,$PACKAGE_BUGREPORT,;t t s,@exec_prefix@,$exec_prefix,;t t s,@prefix@,$prefix,;t t s,@program_transform_name@,$program_transform_name,;t t s,@bindir@,$bindir,;t t s,@sbindir@,$sbindir,;t t s,@libexecdir@,$libexecdir,;t t s,@datadir@,$datadir,;t t s,@sysconfdir@,$sysconfdir,;t t s,@sharedstatedir@,$sharedstatedir,;t t s,@localstatedir@,$localstatedir,;t t s,@libdir@,$libdir,;t t s,@includedir@,$includedir,;t t s,@oldincludedir@,$oldincludedir,;t t s,@infodir@,$infodir,;t t s,@mandir@,$mandir,;t t s,@build_alias@,$build_alias,;t t s,@host_alias@,$host_alias,;t t s,@target_alias@,$target_alias,;t t s,@DEFS@,$DEFS,;t t s,@ECHO_C@,$ECHO_C,;t t s,@ECHO_N@,$ECHO_N,;t t s,@ECHO_T@,$ECHO_T,;t t s,@LIBS@,$LIBS,;t t s,@VERSION@,$VERSION,;t t s,@RELEASE@,$RELEASE,;t t s,@LIBOBJS@,$LIBOBJS,;t t s,@LTLIBOBJS@,$LTLIBOBJS,;t t CEOF _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # Split the substitutions into bite-sized pieces for seds with # small command number limits, like on Digital OSF/1 and HP-UX. ac_max_sed_lines=48 ac_sed_frag=1 # Number of current file. ac_beg=1 # First line for current file. ac_end=$ac_max_sed_lines # Line after last line for current file. 
ac_more_lines=: ac_sed_cmds= while $ac_more_lines; do if test $ac_beg -gt 1; then sed "1,${ac_beg}d; ${ac_end}q" $tmp/subs.sed >$tmp/subs.frag else sed "${ac_end}q" $tmp/subs.sed >$tmp/subs.frag fi if test ! -s $tmp/subs.frag; then ac_more_lines=false else # The purpose of the label and of the branching condition is to # speed up the sed processing (if there are no `@' at all, there # is no need to browse any of the substitutions). # These are the two extra sed commands mentioned above. (echo ':t /@[a-zA-Z_][a-zA-Z_0-9]*@/!b' && cat $tmp/subs.frag) >$tmp/subs-$ac_sed_frag.sed if test -z "$ac_sed_cmds"; then ac_sed_cmds="sed -f $tmp/subs-$ac_sed_frag.sed" else ac_sed_cmds="$ac_sed_cmds | sed -f $tmp/subs-$ac_sed_frag.sed" fi ac_sed_frag=`expr $ac_sed_frag + 1` ac_beg=$ac_end ac_end=`expr $ac_end + $ac_max_sed_lines` fi done if test -z "$ac_sed_cmds"; then ac_sed_cmds=cat fi fi # test -n "$CONFIG_FILES" _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF for ac_file in : $CONFIG_FILES; do test "x$ac_file" = x: && continue # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". case $ac_file in - | *:- | *:-:* ) # input from stdin cat >$tmp/stdin ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; *:* ) ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; * ) ac_file_in=$ac_file.in ;; esac # Compute @srcdir@, @top_srcdir@, and @INSTALL@ for subdirectories. ac_dir=`(dirname "$ac_file") 2>/dev/null || $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` { if $as_mkdir_p; then mkdir -p "$ac_dir" else as_dir="$ac_dir" as_dirs= while test ! 
-d "$as_dir"; do as_dirs="$as_dir $as_dirs" as_dir=`(dirname "$as_dir") 2>/dev/null || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` done test ! -n "$as_dirs" || mkdir $as_dirs fi || { { echo "$as_me:$LINENO: error: cannot create directory \"$ac_dir\"" >&5 echo "$as_me: error: cannot create directory \"$ac_dir\"" >&2;} { (exit 1); exit 1; }; }; } ac_builddir=. if test "$ac_dir" != .; then ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` # A "../" for each directory in $ac_dir_suffix. ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'` else ac_dir_suffix= ac_top_builddir= fi case $srcdir in .) # No --srcdir option. We are building in place. ac_srcdir=. if test -z "$ac_top_builddir"; then ac_top_srcdir=. else ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'` fi ;; [\\/]* | ?:[\\/]* ) # Absolute path. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ;; *) # Relative path. ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_builddir$srcdir ;; esac # Do not use `cd foo && pwd` to compute absolute paths, because # the directories may not exist. case `pwd` in .) ac_abs_builddir="$ac_dir";; *) case "$ac_dir" in .) ac_abs_builddir=`pwd`;; [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";; *) ac_abs_builddir=`pwd`/"$ac_dir";; esac;; esac case $ac_abs_builddir in .) ac_abs_top_builddir=${ac_top_builddir}.;; *) case ${ac_top_builddir}. in .) ac_abs_top_builddir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;; *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;; esac;; esac case $ac_abs_builddir in .) ac_abs_srcdir=$ac_srcdir;; *) case $ac_srcdir in .) 
ac_abs_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;; *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;; esac;; esac case $ac_abs_builddir in .) ac_abs_top_srcdir=$ac_top_srcdir;; *) case $ac_top_srcdir in .) ac_abs_top_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;; *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;; esac;; esac if test x"$ac_file" != x-; then { echo "$as_me:$LINENO: creating $ac_file" >&5 echo "$as_me: creating $ac_file" >&6;} rm -f "$ac_file" fi # Let's still pretend it is `configure' which instantiates (i.e., don't # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ if test x"$ac_file" = x-; then configure_input= else configure_input="$ac_file. " fi configure_input=$configure_input"Generated from `echo $ac_file_in | sed 's,.*/,,'` by configure." # First look for the input files in the build tree, otherwise in the # src tree. ac_file_inputs=`IFS=: for f in $ac_file_in; do case $f in -) echo $tmp/stdin ;; [\\/$]*) # Absolute (can't be DOS-style, as IFS=:) test -f "$f" || { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } echo "$f";; *) # Relative if test -f "$f"; then # Build tree echo "$f" elif test -f "$srcdir/$f"; then # Source tree echo "$srcdir/$f" else # /dev/null tree { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } fi;; esac done` || { (exit 1); exit 1; } _ACEOF cat >>$CONFIG_STATUS <<_ACEOF sed "$ac_vpsub $extrasub _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF :t /@[a-zA-Z_][a-zA-Z_0-9]*@/!b s,@configure_input@,$configure_input,;t t s,@srcdir@,$ac_srcdir,;t t s,@abs_srcdir@,$ac_abs_srcdir,;t t s,@top_srcdir@,$ac_top_srcdir,;t t s,@abs_top_srcdir@,$ac_abs_top_srcdir,;t t s,@builddir@,$ac_builddir,;t t s,@abs_builddir@,$ac_abs_builddir,;t t 
s,@top_builddir@,$ac_top_builddir,;t t s,@abs_top_builddir@,$ac_abs_top_builddir,;t t " $ac_file_inputs | (eval "$ac_sed_cmds") >$tmp/out rm -f $tmp/stdin if test x"$ac_file" != x-; then mv $tmp/out $ac_file else cat $tmp/out rm -f $tmp/out fi done _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF { (exit 0); exit 0; } _ACEOF chmod +x $CONFIG_STATUS ac_clean_files=$ac_clean_files_save # configure is writing to config.log, and then calls config.status. # config.status does its own redirection, appending to config.log. # Unfortunately, on DOS this fails, as config.log is still kept open # by configure, so config.status won't be able to write to it; its # output is simply discarded. So we exec the FD to /dev/null, # effectively closing config.log, so it can be properly (re)opened and # appended to by config.status. When coming back to configure, we # need to make the FD available again. if test "$no_create" != yes; then ac_cs_success=: ac_config_status_args= test "$silent" = yes && ac_config_status_args="$ac_config_status_args --quiet" exec 5>/dev/null $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false exec 5>>config.log # Use ||, not &&, to avoid exiting from the if with $? = 1, which # would make configure fail if this is the last instruction. $ac_cs_success || { (exit 1); exit 1; } fi trunk/rds-gen.10000644000175000017500000000540411313644724013174 0ustar benoitbenoit.Dd October 30, 2006 .Dt RDS-GEN-SINK 1 .Os .Sh NAME .Nm rds-gen .Nd write data from a file to an RDS socket .Pp .Nm rds-sink .Nd write data from an RDS socket to a file .Sh SYNOPSIS .Nm rds-gen .Bk -words .Op Fl s Ar source_address:source_port .Op Fl d Ar destination_address:destination_port .Op Fl f Ar input_file .Op Fl m Ar message_size .Op Fl l Ar total_bytes .Op Fl i Ar interval .Nm rds-sink .Bk -words .Op Fl s Ar listen_address:listen_port .Op Fl f Ar output_file .Op Fl i Ar interval .Sh DESCRIPTION The .Nm and .Nm rds-sink utilities are used to stream data through RDS sockets. 
rds-gen reads data from a file descriptor and sends it as messages down
an RDS socket.
rds-sink receives messages from an RDS socket and writes them to a file
descriptor.
The following options are shared between rds-gen and rds-sink:
.Bl -tag -width Ds
.It Fl s Ar address:port
Binds the RDS socket to the given address and port.
rds-gen will send messages from this address and port.
rds-sink will receive messages sent to this address and port.
.It Fl f Ar file
rds-gen will read data from this file and rds-sink will write data to
this file.
If '-' is given as the filename then rds-gen will use standard input
and rds-sink will use standard output.
.It Fl i Ar interval_seconds
A periodic summary of the number and size of messages that are sent
and received is written to standard error at this interval.
.El
.Pp
In addition, rds-gen supports the following options:
.Bl -tag -width Ds
.It Fl d Ar address:port
Messages are sent to this destination address and port.
If this option is specified multiple times then the messages are sent
to each destination address in a round-robin fashion.
.It Fl m Ar message_size
Specifies the size of the messages that are sent down the RDS socket.
The default message size is 4k.
The message size must not be greater than the buffer size.
.It Fl l Ar total_bytes
Specifies the number of bytes that will be sent out the socket before
rds-gen exits.
If this is not specified and rds-gen was given a source file then it
will run until it gets EOF from the file.
If no file was given and this option is not specified then rds-gen
will send data indefinitely.
.El
.Pp
.Sh EXAMPLES
rds-gen on host src sends data indefinitely to rds-sink on dest, which
prints out the amount of data it receives every second.
.Pp
.Dl $ rds-sink -s dest:22222 -i 1
.Dl $ rds-gen -s src:11111 -d dest:22222
.Pp
Read 100M from /dev/zero on src and write it to /dev/null on dest,
printing stats on both sides every minute.
.Dl $ rds-sink -s dest:22222 -f /dev/null -i 60 .Dl $ rds-gen -s src:11111 -f /dev/zero -d dest:22222 -i 60 .Pp Watch rds-gen write data as fast as it can into a local black hole because there is no bound receiving socket. .Dl $ rds-gen -s src:11111 -d localhost:31337 -i 1 .Pp trunk/rds-gen.c0000644000175000017500000001567411313644724013270 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 *
 */

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * rds-gen.c: Spew some RDS packets
 */

#define _LARGEFILE64_SOURCE

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "kernel-list.h"
#include "rdstool.h"

/*
 * Print the command-line synopsis and terminate the process.
 *
 * @rc: exit status.  A non-zero value sends the text to stderr, zero
 *      sends it to stdout.
 *
 * Does not return: exit(rc) is called unconditionally.
 *
 * NOTE(review): the option placeholders in the usage text look
 * truncated (e.g. "-s :" and "[-f ]") -- confirm against the upstream
 * source before relying on the wording.
 */
void print_usage(int rc)
{
	/* Width of the program name; pads the continuation lines via %*s. */
	int namelen = strlen(progname);
	FILE *output = rc ? stderr : stdout;

	verbosef(0, output,
		 "Usage: %s -s : [[-d :] ...]\n"
		 " %*s [-f ] [-m ]\n"
		 " %*s [-l ] [-i ]\n"
		 " %*s [-v ...] [-q ...]\n"
		 " %s -h\n"
		 " %s -V\n",
		 progname, namelen, "", namelen, "", namelen, "",
		 progname, progname);

	exit(rc);
}

/*
 * Print the program name and version line to stdout, then exit with
 * status 0.  Does not return.
 *
 * NOTE(review): "VERSION" is part of the string literal and is printed
 * verbatim; presumably a build-time substitution was intended --
 * confirm.
 */
void print_version()
{
	verbosef(0, stdout, "%s version VERSION\n", progname);

	exit(0);
}

/*
 * Pick the next destination.
 * Currently round-robin, but could be made fancy
 *
 * @ctxt: context whose rc_daddrs list holds the destinations.
 * @de:   destination used for the previous message, or NULL on the
 *        first call.
 *
 * Returns the entry following @de, wrapping to the first entry when
 * @de is NULL or is the last element.  The list head itself is never
 * returned as long as rc_daddrs is non-empty; main() refuses to start
 * with an empty destination list.
 */
static struct rds_endpoint *pick_dest(struct rds_context *ctxt,
				      struct rds_endpoint *de)
{
	struct list_head *next;

	/* de->re_item.next pointing at the head means de is the tail. */
	if (!de || (de->re_item.next == &ctxt->rc_daddrs))
		next = ctxt->rc_daddrs.next;
	else
		next = de->re_item.next;

	return list_entry(next, struct rds_endpoint, re_item);
}

/*
 * Read up to @len bytes from STDIN_FILENO into @bytes.
 *
 * Returns 0 when the loop ends without an error, -EPIPE when end of
 * file is hit after only part of the buffer was read, or a negative
 * errno value when read() fails.  A read interrupted by a signal
 * (EINTR) is retried while runningp() stays true.
 *
 * The function keeps a static "first" flag.  Once it has been cleared,
 * every later call returns 0 immediately without reading, so the
 * caller keeps whatever the buffer already holds.
 * NOTE(review): the flag is cleared whenever rc_filename is set and is
 * not "-"; confirm that reading the input only once is intended.
 *
 * End of file before any byte has been read also returns 0.
 */
static ssize_t fill_stdin(struct rds_context *ctxt, char *bytes,
			  ssize_t len)
{
	ssize_t ret = 0;
	char *ptr = bytes;
	static int first = 1;

	if (!first)
		return ret;

	if (ctxt->rc_filename && strcmp(ctxt->rc_filename,"-"))
		first = 0;

	while (len && runningp()) {
		/* Emit the periodic statistics line if one is due. */
		stats_print();

		ret = read(STDIN_FILENO, ptr, len);
		if (!ret) {
			/* EOF: only an error if the buffer is half full. */
			if (ptr != bytes) {
				verbosef(0, stderr,
					 "%s: Unexpected end of file reading from %s\n",
					 progname, ctxt->rc_filename);
				ret = -EPIPE;
			}
			break;
		}
		if (ret < 0) {
			ret = -errno;
			if (ret == -EINTR)
				continue;
			verbosef(0, stderr,
				 "%s: Error reading from %s: %s\n",
				 progname, ctxt->rc_filename,
				 strerror(-ret));
			break;
		}

		stats_add_read(ret);
		ptr += ret;
		len -= ret;
		ret = 0;  /* If this filled the buffer, we return success */
	}

	verbosef(3, stderr, "Read %zd bytes from stdin\n", ptr - bytes);

	return ret;
}

/*
 * Provide message contents when no input file was given: the buffer is
 * zero-filled on the first call and left untouched afterwards.
 * Always returns 0.
 */
static ssize_t fill_pattern(struct rds_context *ctxt, char *bytes,
			    ssize_t len)
{
	static int first = 1;

	stats_print();

	if
(first) { memset(bytes, 0, len); first = 0; } return 0; } static ssize_t fill_buff(struct rds_context *ctxt, char *bytes, ssize_t len) { ssize_t ret; /* Each possible method must handle calling stats_print() */ if (ctxt->rc_filename) ret = fill_stdin(ctxt, bytes, len); else ret = fill_pattern(ctxt, bytes, len); return ret; } static ssize_t send_buff(struct rds_endpoint *se, struct msghdr *msg) { ssize_t ret = 0; while (runningp()) { stats_print(); ret = sendmsg(se->re_fd, msg, 0); if (ret < 0) { ret = -errno; if (ret == -EINTR) continue; verbosef(0, stderr, "%s: Error from sendmsg: %s\n", progname, strerror(-ret)); } /* Success */ break; } return ret; } static int wli_do_send(struct rds_context *ctxt) { char bytes[ctxt->rc_msgsize]; int ret = 0; struct rds_endpoint *de = NULL, *se = ctxt->rc_saddr; struct iovec iov = { .iov_base = bytes, .iov_len = ctxt->rc_msgsize, }; struct msghdr msg = { .msg_name = NULL, /* Picked later */ .msg_namelen = sizeof(struct sockaddr_in), .msg_iov = &iov, .msg_iovlen = 1, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0, }; verbosef(2, stderr, "Starting send loop\n"); stats_start(); while (runningp()) { /* Calls stats_print() */ ret = fill_buff(ctxt, bytes, ctxt->rc_msgsize); if (ret) { if (ret == -EINTR) continue; else break; } de = pick_dest(ctxt, de); verbosef(2, stderr, "Destination %s\n", de->re_name); msg.msg_name = &de->re_addr; if (ctxt->rc_total && ((stats_get_send() + ctxt->rc_msgsize) > ctxt->rc_total)) iov.iov_len = ctxt->rc_total - stats_get_send(); /* Calls stats_print() */ ret = send_buff(se, &msg); if (ret < 0) break; stats_add_send(ret); if (ctxt->rc_total && (stats_get_send() >= ctxt->rc_total)) break; } verbosef(2, stderr, "Stopping send loop\n"); stats_total(); return ret; } int main(int argc, char *argv[]) { int rc; char ipbuf[INET_ADDRSTRLEN]; struct rds_endpoint *e; struct rds_context ctxt = { .rc_msgsize = RDS_DEFAULT_MSG_SIZE, }; INIT_LIST_HEAD(&ctxt.rc_daddrs); rc = parse_options(argc, argv, 
RDS_TOOL_BASE_OPTS RDS_GEN_OPTS, &ctxt); if (rc) print_usage(rc); if (list_empty(&ctxt.rc_daddrs)) { verbosef(0, stderr, "%s: Destination endpoint address required\n", progname); print_usage(-EINVAL); } inet_ntop(PF_INET, &ctxt.rc_saddr->re_addr.sin_addr, ipbuf, INET_ADDRSTRLEN); verbosef(2, stderr, "Binding endpoint %s:%d\n", ipbuf, ntohs(ctxt.rc_saddr->re_addr.sin_port)); rc = rds_bind(&ctxt); if (rc) goto out; if (ctxt.rc_filename) { rc = dup_file(&ctxt, STDIN_FILENO, O_RDONLY); if (rc) goto out; if (!strcmp(ctxt.rc_filename, "-")) ctxt.rc_filename = ""; } list_for_each_entry(e, &ctxt.rc_daddrs, re_item) { inet_ntop(PF_INET, &e->re_addr.sin_addr, ipbuf, INET_ADDRSTRLEN); verbosef(2, stderr, "Adding destination %s:%d\n", ipbuf, ntohs(e->re_addr.sin_port)); } rc = setup_signals(); if (rc) { verbosef(0, stderr, "%s: Unable to initialize signals\n", progname); goto out; } rc = wli_do_send(&ctxt); out: free(ctxt.rc_saddr->re_name); free(ctxt.rc_saddr); return rc; } trunk/kernel-list.h0000644000175000017500000001237711313644724014164 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/*
 * List node, embedded in the structure that is to be linked.  The same
 * type serves as the list head; an empty list is a head whose next and
 * prev both point back at the head itself.
 */
struct list_head {
	struct list_head *next, *prev;
};

/* Static initializer that makes "name" an empty list. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a list head variable called "name", initialized to empty. */
#define LIST_HEAD(name) \
	struct list_head name = { &name, &name }

/* Run-time counterpart of LIST_HEAD_INIT: make *ptr an empty list. */
#define INIT_LIST_HEAD(ptr) do { \
	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/*
 * Compilers other than GCC and Watcom get __inline__ defined away, so
 * the helpers below become plain static functions there.
 */
#if (!defined(__GNUC__) && !defined(__WATCOMC__))
#define __inline__
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static __inline__ void __list_add(struct list_head * new,
				  struct list_head * prev,
				  struct list_head * next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/*
 * Insert a new entry after the specified head, i.e. at the front of
 * the list.
 */
static __inline__ void list_add(struct list_head *new,
				struct list_head *head)
{
	__list_add(new, head, head->next);
}

/*
 * Insert a new entry at the tail, i.e. just before the head.
 */
static __inline__ void list_add_tail(struct list_head *new,
				     struct list_head *head)
{
	__list_add(new, head->prev, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
* * This is only for internal list manipulation where we know * the prev/next entries already! */ static __inline__ void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; prev->next = next; } static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del(entry->prev, entry->next); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add_tail(list, head); } static __inline__ int list_empty(struct list_head *head) { return head->next == head; } /* * Splice in "list" into "head" */ static __inline__ void list_splice(struct list_head *list, struct list_head *head) { struct list_head *first = list->next; if (first != list) { struct list_head *last = list->prev; struct list_head *at = head->next; first->prev = head; head->next = first; last->next = at; at->prev = last; } } #define list_entry(ptr, type, member) \ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop counter. 
* @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) #endif trunk/configure.in0000644000175000017500000000017211313644724014061 0ustar benoitbenoitAC_PREREQ(2.55) AC_INIT() VERSION=1.4 RELEASE=1 AC_SUBST(VERSION) AC_SUBST(RELEASE) AC_OUTPUT(Makefile rds-tools.spec) trunk/stats.c0000644000175000017500000001253011313644724013053 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * stats.c - Print stats at an interval */ #include #include #include #include #include #include #include #include #include #include #include #include "kernel-list.h" #include "rdstool.h" static int stats_delay = 0; /* Delay in seconds */ static int print_extended = 0; /* Print read/write stats? */ static sig_atomic_t time_to_print = 0; struct rds_tool_stats { uint64_t rs_send_bytes; uint64_t rs_send_bytes_interval; uint64_t rs_send_packets; uint64_t rs_send_packets_interval; uint64_t rs_recv_bytes; uint64_t rs_recv_bytes_interval; uint64_t rs_recv_packets; uint64_t rs_recv_packets_interval; uint64_t rs_read_bytes; uint64_t rs_read_bytes_interval; uint64_t rs_write_bytes; uint64_t rs_write_bytes_interval; } tool_stats; #define inc_net_stat(type, val) do { \ tool_stats.rs_##type##_bytes += val; \ tool_stats.rs_##type##_bytes_interval += val; \ tool_stats.rs_##type##_packets += 1; \ tool_stats.rs_##type##_packets_interval += 1; \ } while (0) #define inc_io_stat(type, val) do { \ tool_stats.rs_##type##_bytes += val; \ tool_stats.rs_##type##_bytes_interval += val; \ } while (0) #define clear_interval() do { \ tool_stats.rs_send_bytes_interval = 0; \ tool_stats.rs_recv_bytes_interval = 0; \ tool_stats.rs_send_packets_interval = 0; \ tool_stats.rs_recv_packets_interval = 0; \ tool_stats.rs_read_bytes_interval = 0; \ tool_stats.rs_write_bytes_interval = 0; \ } while (0) static void handler(int signum) { time_to_print = 1; } static int setup_alarm(void) { int rc = 0; struct sigaction act; sigemptyset(&act.sa_mask); act.sa_handler = handler; act.sa_flags = 0; rc = sigaction(SIGALRM, &act, NULL); if (rc) { rc = -errno; verbosef(0, stderr, "%s: Unable to initialize timer: %s\n", progname, strerror(-rc)); } return rc; } void stats_add_read(uint64_t bytes) { inc_io_stat(read, bytes); } void stats_add_write(uint64_t bytes) { inc_io_stat(write, bytes); } void stats_add_send(uint64_t bytes) { 
inc_net_stat(send, bytes); } uint64_t stats_get_send(void) { return tool_stats.rs_send_bytes; } void stats_add_recv(uint64_t bytes) { inc_net_stat(recv, bytes); } static void stats_arm(void) { time_to_print = 0; alarm(stats_delay); } int stats_init(int delay) { int rc = 0; stats_delay = delay; if (stats_delay) rc = setup_alarm(); return rc; } void stats_extended(int extendedp) { print_extended = !!extendedp; } void stats_start(void) { if (stats_delay) { verbosef(1, stderr, "%19s %19s %19s %19s\n", "Bytes sent/s", "Packets sent/s", "Bytes recv/s", "Packets recv/s"); if (print_extended) verbosef(1, stderr, " %19s %19s", "Bytes read/s", "Bytes written/s"); verbosef(1, stderr, "\n"); stats_arm(); } } static void stats_output(void) { verbosef(0, stderr, "%19"PRIu64" %19"PRIu64" %19"PRIu64" %19"PRIu64, tool_stats.rs_send_bytes_interval / stats_delay, tool_stats.rs_send_packets_interval / stats_delay, tool_stats.rs_recv_bytes_interval / stats_delay, tool_stats.rs_recv_packets_interval / stats_delay); if (print_extended) verbosef(0, stderr, " %19"PRIu64" %19"PRIu64, tool_stats.rs_read_bytes_interval / stats_delay, tool_stats.rs_write_bytes_interval / stats_delay); verbosef(0, stderr, "\n"); } void stats_print(void) { /* Are stats on? 
*/ if (stats_delay && time_to_print) { stats_output(); clear_interval(); stats_arm(); } } void stats_total(void) { if (!stats_delay) return; verbosef(0, stderr, "Total:\n" "%19"PRIu64" %19"PRIu64" %19"PRIu64" %19"PRIu64, tool_stats.rs_send_bytes, tool_stats.rs_send_packets, tool_stats.rs_recv_bytes, tool_stats.rs_recv_packets); if (print_extended) verbosef(0, stderr, " %19"PRIu64" %19"PRIu64, tool_stats.rs_read_bytes, tool_stats.rs_write_bytes); verbosef(0, stderr, "\n"); } trunk/Makefile.in0000644000175000017500000000501311313644724013614 0ustar benoitbenoitprefix = $(DESTDIR)@prefix@ exec_prefix = $(DESTDIR)@exec_prefix@ bindir = $(DESTDIR)@bindir@ mandir = $(DESTDIR)@mandir@ incdir = $(DESTDIR)@includedir@ all: all-programs CFLAGS = -O2 -Wall CPPFLAGS = -DDEBUG_EXE -MD -MP -MF $(@D)/.$(basename $(@F)).d HEADERS = kernel-list.h rdstool.h pfhack.h net/rds.h net/ib_rds.h COMMON_SOURCES = options.c stats.c pfhack.c SOURCES = $(addsuffix .c,$(PROGRAMS)) $(COMMON_SOURCES) CLEAN_OBJECTS = $(addsuffix .o,$(PROGRAMS)) $(subst .c,.o,$(COMMON_SOURCES)) # This is the default DYNAMIC_PF_RDS = true ifneq ($(DYNAMIC_PF_RDS),) CPPFLAGS += -DDYNAMIC_PF_RDS COMMON_OBJECTS = $(subst .c,.o,$(COMMON_SOURCES)) else COMMON_OBJECTS = $(subst .c,.o,$(filter-out pfhack.c,$(COMMON_SOURCES))) endif PROGRAMS = rds-gen rds-sink rds-info rds-stress rds-ping all-programs: $(PROGRAMS) install: $(PROGRAMS) install -d $(bindir) install -m 555 -s $(PROGRAMS) $(bindir) install -d $(mandir)/man1 install -d $(mandir)/man7 install -m 644 *.1 $(mandir)/man1 install -m 644 *.7 $(mandir)/man7 install -d $(incdir)/net install -m 444 net/rds.h $(incdir)/net clean: rm -f $(PROGRAMS) $(CLEAN_OBJECTS) distclean: clean rm -f .*.d $(PROGRAMS) : % : %.o $(COMMON_OBJECTS) gcc $(CFLAGS) $(LDFLAGS) -o $@ $^ LOCAL_DFILES := $(wildcard .*.d) ifneq ($(LOCAL_DFILES),) .PHONY: $(LOCAL_DFILES) -include $(LOCAL_DFILES) endif VERSION := @VERSION@ RELEASE := @RELEASE@ TAR_PREFIX := rds-tools-$(VERSION)-$(RELEASE) TAR_FILE 
:= $(TAR_PREFIX).tar.gz EXTRA_DIST := rds-info.1 \ rds-gen.1 \ rds-sink.1 \ rds-stress.1 \ rds-ping.1 \ rds.7 \ rds-rdma.7 \ Makefile.in \ rds-tools.spec.in \ configure.in \ configure \ README \ rds-tools.txt \ stap/rds.stp \ stap/README \ docs/rds-architecture.txt \ examples/Makefile \ examples/rds-sample.c \ examples/README DISTFILES := $(SOURCES) $(HEADERS) $(EXTRA_DIST) $(TAR_FILE): Makefile rds-tools.spec @rm -rf $@ $(TAR_PREFIX) || : @mkdir $(TAR_PREFIX) for a in $^ $(DISTFILES); do \ if [ ! -f $$a ]; then \ continue; \ fi; \ targ=$(TAR_PREFIX)/$$(dirname $$a); \ mkdir -p $$targ; \ cp $$a $$targ; \ done tar -zcf $@ $(TAR_PREFIX) .PHONY: rpm rpm: $(TAR_FILE) rpmbuild -ta $^ .PHONY: dist dist: $(TAR_FILE) trunk/options.c0000644000175000017500000002237111313644724013414 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * options.c - options and stuff */ #define _LARGEFILE64_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "kernel-list.h" #include "rdstool.h" /* This gets changed in parse_options() */ char *progname = "rds-generic-tool"; unsigned int verbose = 1; sig_atomic_t running = 1; /* * Take "address:port" and return a sockaddr(_in) that describes it. * Since RDS is IPv4 only, we don't worry about PF_INET6. * * XXX: Should we try a default IP or default port? RDS is very * endpoint-oriented; right now we require explicitness. * * Since getaddrinfo(3) returns multiple addresses, we simply find the * first SOCK_DGRAM AF_INET result. Note that RDS actually uses * SOCK_SEQPACKET, but we're lying to getaddrinfo(3). 
*/ static int parse_endpoint(struct rds_endpoint *nep) { int rc; char *host, *port; struct addrinfo *list, *try; struct addrinfo hint = { .ai_family = PF_INET, .ai_socktype = SOCK_DGRAM, }; host = strdup(nep->re_name); if (!host) { rc = -ENOMEM; verbosef(0, stderr, "%s: Unable to allocate memory\n", progname); goto out; } port = strchr(host, ':'); if (!port) { rc = -EINVAL; verbosef(0, stderr, "%s: Invalid endpoint: %s\n", progname, nep->re_name); goto out; } *port = '\0'; port++; rc = getaddrinfo(host, port, &hint, &list); if (rc) { verbosef(0, stderr, "%s: Unable to resolve \"%s\": %s\n", progname, nep->re_name, gai_strerror(rc)); goto out; } for (try = list; try; try = try->ai_next) { if ((try->ai_family == PF_INET) && (try->ai_socktype == SOCK_DGRAM)) break; } if (try) { if (try->ai_addrlen != sizeof(struct sockaddr_in)) verbosef(0, stderr, "%s: OMG WTF BBQ! try->ai_addrlen = %d, sizeof(struct sockaddr_in) = %zd\n", progname, try->ai_addrlen, sizeof(struct sockaddr_in)); memcpy(&nep->re_addr, try->ai_addr, try->ai_addrlen); } if (list) freeaddrinfo(list); out: return rc; } static int add_endpoint(const char *endpoint, struct list_head *list) { int rc; struct rds_endpoint *nep; nep = malloc(sizeof(struct rds_endpoint)); if (!nep) return -ENOMEM; nep->re_name = strdup(endpoint); if (!nep->re_name) { free(nep); return -ENOMEM; } rc = parse_endpoint(nep); if (!rc) { list_add_tail(&nep->re_item, list); } else { free(nep->re_name); free(nep); } return rc; } static int get_number(char *arg, uint64_t *res) { char *ptr = NULL; uint64_t num; num = strtoull(arg, &ptr, 0); if ((ptr == arg) || (num == UINT64_MAX)) return(-EINVAL); switch (*ptr) { case '\0': break; case 'g': case 'G': num *= 1024; /* FALL THROUGH */ case 'm': case 'M': num *= 1024; /* FALL THROUGH */ case 'k': case 'K': num *= 1024; /* FALL THROUGH */ case 'b': case 'B': break; default: return -EINVAL; } *res = num; return 0; } extern char *optarg; extern int optopt; extern int optind; extern int opterr; int 
parse_options(int argc, char *argv[], const char *opts, struct rds_context *ctxt) { int c, rc = 0; uint64_t val; struct list_head saddrs; if (argc && argv[0]) progname = basename(argv[0]); INIT_LIST_HEAD(&saddrs); opterr = 0; while ((c = getopt(argc, argv, opts)) != EOF) { switch (c) { case 's': if (!list_empty(&saddrs)) { verbosef(0, stderr, "%s: Only one source address allowed\n", progname); rc = -EINVAL; } else rc = add_endpoint(optarg, &saddrs); break; case 'd': rc = add_endpoint(optarg, &ctxt->rc_daddrs); break; case 'm': rc = get_number(optarg, &val); if (rc) { verbosef(0, stderr, "%s: Invalid number: %s\n", progname, optarg); break; } if (val > UINT32_MAX) { rc = -EINVAL; verbosef(0, stderr, "%s: Message size too large: %"PRIu64"\n", progname, val); } else ctxt->rc_msgsize = (uint32_t)val; break; case 'l': rc = get_number(optarg, &ctxt->rc_total); if (rc) { verbosef(0, stderr, "%s: Invalid number: %s\n", progname, optarg); } break; case 'f': ctxt->rc_filename = optarg; stats_extended(1); break; case 'i': rc = get_number(optarg, &val); if (rc) { verbosef(0, stderr, "%s: Invalid number: %s\n", progname, optarg); break; } if (val > LONG_MAX) { rc = -EINVAL; verbosef(0, stderr, "%s: Sleep interval too large: %"PRIu64"\n", progname, val); } else { rc = stats_init((long)val); } break; case 'v': verbose++; break; case 'q': if (verbose) verbose--; break; case 'V': print_version(); break; case 'h': print_usage(0); break; case '-': if (!strcmp(optarg, "help")) print_usage(0); else if (!strcmp(optarg, "version")) print_version(); else { rc = -EINVAL; verbosef(0, stderr, "%s: Invalid argument: \'--%s\'\n", progname, optarg); } break; case '?': verbosef(0, stderr, "%s: Invalid option \'-%c\'\n", progname, optopt); rc = -EINVAL; break; case ':': verbosef(0, stderr, "%s: Option \'-%c\' requires an argument\n", progname, optopt); rc = -EINVAL; break; default: verbosef(0, stderr, "%s: Shouldn't get here %c %c\n", progname, optopt, c); rc = -EINVAL; break; } if (rc) goto out; 
} if (list_empty(&saddrs)) { verbosef(0, stderr, "%s: Source endpoint address required\n", progname); rc = -EINVAL; goto out; } ctxt->rc_saddr = list_entry(saddrs.prev, struct rds_endpoint, re_item); out: return rc; } int rds_bind(struct rds_context *ctxt) { int rc; struct rds_endpoint *e = ctxt->rc_saddr; rc = socket(PF_RDS, SOCK_SEQPACKET, 0); if (rc < 0) { rc = -errno; verbosef(0, stderr, "%s: Unable to create socket: %s\n", progname, strerror(-rc)); goto out; } e->re_fd = rc; rc = bind(e->re_fd, (struct sockaddr *)&e->re_addr, sizeof(struct sockaddr_in)); if (rc) { rc = -errno; verbosef(0, stderr, "%s: Unable to bind socket: %s\n", progname, strerror(-rc)); close(e->re_fd); e->re_fd = -1; goto out; } out: return rc; } int dup_file(struct rds_context *ctxt, int fd, int flags) { int tmp_fd, rc = 0; char *type; /* "-" is stdin/stdout */ if (!strcmp(ctxt->rc_filename, "-")) goto out; tmp_fd = open64(ctxt->rc_filename, flags); if (tmp_fd < 0) { rc = -errno; verbosef(0, stderr, "%s: Unable to open file \"%s\": %s\n", progname, ctxt->rc_filename, strerror(-rc)); goto out; } if (tmp_fd != fd) { rc = dup2(tmp_fd, fd); if (rc < 0) { rc = -errno; switch (fd) { case STDIN_FILENO: type = "stdin"; break; case STDOUT_FILENO: type = "stdout"; break; case STDERR_FILENO: type = "stderr"; break; default: type = "random fd"; break; } verbosef(0, stderr, "%s: Unable to set file \"%s\" as %s: %s\n", progname, ctxt->rc_filename, type, strerror(-rc)); } else if (rc != fd) { verbosef(0, stderr, "%s: dup2(2) failed for some reason!\n", progname); rc = -EBADF; } else rc = 0; } out: return rc; } int runningp(void) { return running; } void handler(int signum) { running = 0; } int setup_signals(void) { int rc = -EINVAL; struct sigaction act; sigemptyset(&act.sa_mask); act.sa_handler = handler; act.sa_flags = 0; if (sigaction(SIGTERM, &act, NULL)) goto out; if (sigaction(SIGINT, &act, NULL)) goto out; act.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &act, NULL)) /* Get EPIPE instead */ goto 
out; rc = 0; out: return rc; } trunk/rds-tools.spec0000644000175000017500000000136411313644724014356 0ustar benoitbenoitSummary: RDS support tools Name: rds-tools Version: 1.4 Release: 1 License: GPL/BSD Group: Applications/Internet URL: http://oss.oracle.com/projects/rds/ Source: rds-tools-%{version}-%{release}.tar.gz BuildRoot: /var/tmp/rds-tools-%{version}-%{release} %description rds-tools is a collection of support tools for the RDS socket API. %prep %setup -n rds-tools-%{version}-%{release} %build %configure make %{?_smp_mflags} %install rm -rf $RPM_BUILD_ROOT make DESTDIR=$RPM_BUILD_ROOT install %clean rm -rf $RPM_BUILD_ROOT %files %defattr(-,root,root) %{_bindir}/* %{_mandir}/* %{_includedir}/* %changelog * Sun Nov 25 2007 Vladimir Sokolovsky - Use DESTDIR * Mon Oct 27 2006 Zach Brown - initial version trunk/rds-info.10000644000175000017500000001175511313644724013364 0ustar benoitbenoit.Dd October 30, 2006 .Dt RDS-INFO 1 .Os .Sh NAME .Nm rds-info .Nd display information from the RDS kernel module .Pp .Sh SYNOPSIS .Nm rds-info .Op Fl v .Bk -words .Op Fl cknrstIT .Sh DESCRIPTION The .Nm utility presents various sources of information that the RDS kernel module maintains. When run without any optional arguments .Nm will output all the information it knows of. When options are specified then only the information associated with those options is displayed. The options are as follows: .Bl -tag -width Ds .It Fl v Requests verbose output. When this option is given, some classes of information will display additional data. .It Fl c Display global counters. Each counter increments as its event occurs. The counters may not be reset. The set of supported counters may change over time. .Bl -tag -width 4 .It CounterName The name of the counter. These names come from the kernel and can change depending on the capability of the kernel module. .It Value The number of times that the counter has been incremented since the kernel module was loaded. 
.El .It Fl k Display all the RDS sockets in the system. There will always be one socket listed that is neither bound to nor connected to any addresses because .Nm itself uses an unbound socket to collect information. .Bl -tag -width 4 .It BoundAddr, BPort The IP address and port that the socket is bound to. 0.0.0.0 0 indicates that the socket has not been bound. .It ConnAddr, CPort The IP address and port that the socket is connected to. 0.0.0.0 0 indicates that the socket has not been connected. .It SndBuf, RcvBuf The number of bytes of message payload which can be queued for sending or receiving on the socket, respectively. .It Inode The number of the inode object associated with the socket. Can be used to locate the process owning a given socket by searching /proc/*/fd for open files referencing a socket with this inode number. .El .It Fl n Display all RDS connections. RDS connections are maintained between nodes by transports. .Bl -tag -width 4 .It LocalAddr The IP address of this node. For connections that originate and terminate on the same node the local address indicates which address initiated the connection establishment. .It RemoteAddr The IP address of the remote end of the connection. .It NextTX The sequence number that will be given to the next message that is sent over the connection. .It NextRX The sequence number that is expected from the next message to arrive over the connection. Any incoming messages with sequence numbers less than this will be dropped. .It Flg Flags which indicate the state of the connection. .Bl -tag -width 4 .It s A process is currently sending a message down the connection. .It c The transport is attempting to connect to the remote address. .It C The connection to the remote host is connected and active. .El .El .It Fl r, Fl s, Fl t Display the messages in the receive, send, or retransmit queues respectively. .Bl -tag -width 4 .It LocalAddr, LPort The local IP address and port on this node associated with the message. 
For sent messages this is the source address, for receive messages it is the destination address. .It RemoteAddr, RPort The remote IP address and port associated with the message. For sent messages this is the destination address, for receive messages it is the source address. .It Seq The sequence number of the message. .It Bytes The number of bytes in the message payload. .El The following information sources are dependent on specific transports which may not always be available. .It Fl I Display the IB connections which the IB transport is using to provide RDS connections. .Bl -tag -width 4 .It LocalAddr The local IP address of this connection. .It RemoteAddr The remote IP address of this connection. .It LocalDev The local IB Global Identifier, printed in IPv6 address syntax. .It RemoteDev The remote IB Global Identifier, printed in IPv6 address syntax. .El If verbose output is requested, per-connection settings such as the maximum number of send and receive work requests will be displayed in addition. .It Fl T Display the TCP sockets which the TCP transport is using to provide RDS connections. .Bl -tag -width 4 .It LocalAddr, LPort The local IP address and port of this socket. .It RemoteAddr, RPort The remote IP address and port that this socket is connected to. .It HdrRemain The number of bytes that must be read off the socket to complete the next full RDS header. .It DataRemain The number of bytes that must be read off the socket to complete the data payload of the message which is being received. .It SentNxt The TCP sequence number of the first byte of the last message that we sent down the connection. .It ExpectedUna The TCP sequence number of the byte past the last byte of the last message that we sent down the connection. When we see that the remote side has acked up to this byte then we know that the remote side has received all our RDS messages. .It SeenUna The TCP sequence number of the byte past the last byte which has been acked by the remote host. 
.El .El .Pp trunk/rds-info.c0000644000175000017500000002232511313644724013441 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: */ #include #include #include #include #include #include #include #include #include #include #include #include "net/rds.h" #include "rdstool.h" #define rds_conn_flag(conn, flag, letter) \ (conn.flags & RDS_INFO_CONNECTION_FLAG_##flag ? letter : '-') #define min(a, b) (a < b ? 
a : b) #define array_size(foo) (sizeof(foo) / sizeof(foo[0])) #define copy_into(var, data, each) ({ \ int __ret = 1; \ memset(&var, 0, sizeof(var)); \ memcpy(&var, data, min(each, sizeof(var))); \ __ret; \ }) #define for_each(var, data, each, len) \ for (;len > 0 && copy_into(var, data, each); \ data += each, len -= min(len, each)) static int opt_verbose = 0; /* Like inet_ntoa, but can be re-entered several times without clobbering * the previously returned string. */ static const char *paddr(int af, const void *addrp) { static char nbuf[8][INET6_ADDRSTRLEN]; static int which = 0; char *string; string = nbuf[which]; which = (which + 1) % 8; inet_ntop(af, addrp, string, INET6_ADDRSTRLEN); return string; } static const char *ipv4addr(uint32_t addr) { return paddr(AF_INET, &addr); } static const char *ipv6addr(const void *addr) { return paddr(AF_INET6, addr); } static void print_counters(void *data, int each, socklen_t len, void *extra) { struct rds_info_counter ctr; printf("\nCounters:\n%25s %16s\n", "CounterName", "Value"); for_each(ctr, data, each, len) printf("%25s %16"PRIu64"\n", ctr.name, ctr.value); } static void print_sockets_v1(void *data, int each, socklen_t len, void *extra) { struct rds_info_socket_v1 sk; printf("\nRDS Sockets:\n%15s %5s %15s %5s %10s %10s\n", "BoundAddr", "BPort", "ConnAddr", "CPort", "SndBuf", "RcvBuf"); for_each(sk, data, each, len) { printf("%15s %5u %15s %5u %10u %10u\n", ipv4addr(sk.bound_addr), ntohs(sk.bound_port), ipv4addr(sk.connected_addr), ntohs(sk.connected_port), sk.sndbuf, sk.rcvbuf); } } static void print_sockets(void *data, int each, socklen_t len, void *extra) { struct rds_info_socket sk; if (each == sizeof(struct rds_info_socket_v1)) { print_sockets_v1(data, each, len, extra); return; } printf("\nRDS Sockets:\n%15s %5s %15s %5s %10s %10s %8s\n", "BoundAddr", "BPort", "ConnAddr", "CPort", "SndBuf", "RcvBuf", "Inode"); for_each(sk, data, each, len) { printf("%15s %5u %15s %5u %10u %10u %8Lu\n", ipv4addr(sk.bound_addr), 
ntohs(sk.bound_port), ipv4addr(sk.connected_addr), ntohs(sk.connected_port), sk.sndbuf, sk.rcvbuf, (unsigned long long) sk.inum); } } static void print_conns(void *data, int each, socklen_t len, void *extra) { struct rds_info_connection conn; printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n", "LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg"); for_each(conn, data, each, len) { printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n", ipv4addr(conn.laddr), ipv4addr(conn.faddr), conn.next_tx_seq, conn.next_rx_seq, rds_conn_flag(conn, SENDING, 's'), rds_conn_flag(conn, CONNECTING, 'c'), rds_conn_flag(conn, CONNECTED, 'C')); } } static void print_msgs(void *data, int each, socklen_t len, void *extra) { struct rds_info_message msg; printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n", (char *)extra, "LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes"); for_each(msg, data, each, len) { printf("%15s %5u %15s %5u %16"PRIu64" %10u\n", ipv4addr(msg.laddr), ntohs(msg.lport), ipv4addr(msg.faddr), ntohs(msg.fport), msg.seq, msg.len); } } static void print_tcp_socks(void *data, int each, socklen_t len, void *extra) { struct rds_info_tcp_socket ts; printf("\nTCP Connections:\n" "%15s %5s %15s %5s %10s %10s %10s %10s %10s\n", "LocalAddr", "LPort", "RemoteAddr", "RPort", "HdrRemain", "DataRemain", "SentNxt", "ExpectUna", "SeenUna"); for_each(ts, data, each, len) { printf("%15s %5u %15s %5u %10"PRIu64" %10"PRIu64" %10u %10u %10u\n", ipv4addr(ts.local_addr), ntohs(ts.local_port), ipv4addr(ts.peer_addr), ntohs(ts.peer_port), ts.hdr_rem, ts.data_rem, ts.last_sent_nxt, ts.last_expected_una, ts.last_seen_una); } } static void print_ib_conns(void *data, int each, socklen_t len, void *extra) { struct rds_info_ib_connection ic; printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n", "LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev"); for_each(ic, data, each, len) { printf("%15s %15s %32s %32s", ipv4addr(ic.src_addr), ipv4addr(ic.dst_addr), ipv6addr(ic.src_gid), 
ipv6addr(ic.dst_gid)); if (opt_verbose) { printf(" send_wr=%u", ic.max_send_wr); printf(", recv_wr=%u", ic.max_recv_wr); printf(", send_sge=%u", ic.max_send_sge); printf(", rdma_fmr_max=%u", ic.rdma_fmr_max); printf(", rdma_fmr_size=%u", ic.rdma_fmr_size); } printf("\n"); } } struct info { int opt_val; char *description; void (*print)(void *data, int each, socklen_t len, void *extra); void *extra; int option_given; }; struct info infos[] = { ['c'] = { RDS_INFO_COUNTERS, "statistic counters", print_counters, NULL, 0 }, ['k'] = { RDS_INFO_SOCKETS, "sockets", print_sockets, NULL, 0 }, ['n'] = { RDS_INFO_CONNECTIONS, "connections", print_conns, NULL, 0 }, ['r'] = { RDS_INFO_RECV_MESSAGES, "recv queue messages", print_msgs, "Receive", 0 }, ['s'] = { RDS_INFO_SEND_MESSAGES, "send queue messages", print_msgs, "Send", 0 }, ['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages", print_msgs, "Retransmit", 0 }, ['T'] = { RDS_INFO_TCP_SOCKETS, "TCP transport sockets", print_tcp_socks, NULL, 0 }, ['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections", print_ib_conns, NULL, 0 }, }; void print_usage(int rc) { FILE *output = rc ? 
stderr : stdout; int i; verbosef(0, output, "The following options limit output to the given " "sources:\n"); for (i = 0; i < array_size(infos); i++) { if (!infos[i].opt_val) continue; printf(" -%c %s\n", i, infos[i].description); } verbosef(0, output, "\n\nIf no options are given then all sources are used.\n"); exit(rc); } void print_version() { } int main(int argc, char **argv) { char optstring[258] = "v+"; int given_options = 0; socklen_t len = 0; void *data = NULL; int fd; int each; int c; char *last; int i; /* quickly append all our info options to the optstring */ last = &optstring[strlen(optstring)]; for (i = 0; i < array_size(infos); i++) { if (!infos[i].opt_val) continue; *last = (char)i; last++; *last = '\0'; } while ((c = getopt(argc, argv, optstring)) != EOF) { switch (c) { case 'v': opt_verbose++; continue; } if (c >= array_size(infos) || !infos[c].opt_val) { verbosef(0, stderr, "%s: Invalid option \'-%c\'\n", progname, optopt); print_usage(1); } infos[c].option_given = 1; given_options++; } fd = socket(PF_RDS, SOCK_SEQPACKET, 0); if (fd < 0) { verbosef(0, stderr, "%s: Unable to create socket: %s\n", progname, strerror(errno)); return 1; } for (i = 0; i < array_size(infos); i++) { if (!infos[i].opt_val || (given_options && !infos[i].option_given)) continue; /* read in the info until we get a full snapshot */ while ((each = getsockopt(fd, SOL_RDS, infos[i].opt_val, data, &len)) < 0) { if (errno != ENOSPC) { verbosef(0, stderr, "%s: Unable get statistics: %s\n", progname, strerror(errno)); return 1; } if (data) data = realloc(data, len); else data = malloc(len); if (data == NULL) { verbosef(0, stderr, "%s: Unable to allocate memory " "for %u bytes of info: %s\n", progname, len, strerror(errno)); return 1; } } infos[i].print(data, each, len, infos[i].extra); if (given_options && --given_options == 0) break; } return 0; } trunk/rds-stress.10000644000175000017500000001644611313644724013756 0ustar benoitbenoit.Dd May 15, 2007 .Dt RDS-STRESS 1 .Os .Sh NAME 
.Nm rds-stress .Nd send messages between processes over RDS sockets .Pp .Sh SYNOPSIS .Nm rds-stress .Bk -words .Op Fl p Ar port_number .Op Fl r Ar receive_address .Op Fl s Ar send_address .Op Fl a Ar ack_bytes .Op Fl q Ar request_bytes .Op Fl D Ar rdma_bytes .Op Fl d Ar queue_depth .Op Fl t Ar nr_tasks .Op Fl c .Op Fl R .Op Fl V .Op Fl v .Sh DESCRIPTION .Nm rds-stress sends messages between groups of tasks, usually running on separate machines. .Pp First a passive receiving instance is started. .Pp .Dl $ rds-stress .Pp Then an active sending instance is started, giving it the address and port at which it will find a listening passive receiver. In addition, it is given configuration options which both instances will use. .Pp .Dl $ rds-stress -s recvhost -p 4000 -t 1 -d 1 .Pp The active sender will parse the options, connect to the passive receiver, and send the options over this connection. From this point on both instances exhibit the exact same behaviour. .Pp They will create a number of child tasks as specified by the -t option. Once the children are created the parent sleeps for a second at a time, printing a summary of statistics at each interval. .Pp Each child will open an RDS socket, each binding to a port number in order after the port number given on the command line. The first child would bind to port 4001 in our example. Each child sets the send and receive buffers to exactly fit the number of messages, requests and acks, that will be in flight as determined by the command line arguments. .Pp The children then enter their loop. They will keep a number of sent messages outstanding as specified by the -d option. When they reach this limit they will wait to receive acks which will allow them to send again. As they receive messages from their peers they immediately send acks. .Pp Every second, the parent process will display statistics of the ongoing stress test. The output is described in section OUTPUT below.
.Pp If the -T option is given, the test will terminate after the specified time, and a summary is printed. .Pp Each child maintains outstanding messages to all other children of the other instance. They do not send to their siblings. .Sh OPTIONS The following options are available for use on the command line: .Bl -tag -width Ds .It Fl p Ar port_number Each parent binds a TCP socket to this port number and their respective address. They will trade the negotiated options over this socket. Each child will bind an RDS socket to the range of ports immediately following this port number, for as many children as there are. .It Fl s Ar send_address A connection attempt is made to this address. Once it's complete and the options are sent over it then children will be created and work will proceed. .It Fl r Ar receive_address This specifies the address that messages will be sent from. If -s is not specified then rds-stress waits for a connection on this address before proceeding. .Pp If this option is not given, rds-stress will choose an appropriate address. The passive process will accept connections on all local interfaces, and obtain the address once the control connection is established. The active process will choose a local address based on the interface through which it connects to the destination address. .It Fl a Ar ack_bytes This specifies the size of the ack messages, in bytes. There is a minimum size which depends on the format of the ack messages, which may change over time. See section "Message Sizes" below. .It Fl q Ar request_bytes This specifies the size of the request messages, in bytes. It also has a minimum size which may change over time. See section "Message Sizes" below. .It Fl D Ar rdma_bytes RDSv3 is capable of transmitting part of a message via RDMA directly from application buffer to application buffer.
This option enables RDMA support in rds-stress: request packets include parameters for an RDMA READ or WRITE operation, which the receiving process executes at the time the ACK packet is sent. See section "Message Sizes" below. .It Fl d Ar queue_depth Each child will try to maintain this many sent messages outstanding to each of its peers on the remote address. .It Fl t Ar nr_tasks Each parent will create this many child tasks. .It Fl T Ar seconds Specify the duration of the test run. After the specified number of seconds, all processes on both ends of the connection will terminate, and the active instance will print a summary. By default, rds-stress will keep on sending and receiving messages. .It Fl z This flag can be used in conjunction with -T. It suppresses the ongoing display of statistics, and prints a summary only. .It Fl c This causes rds-stress to create child tasks which just consume CPU cycles. One task is created for each CPU in the system. First each child observes the maximum rate at which it can consume cycles. This means that this option should only be given on an idle system. rds-stress can then calculate the CPU use of the system by observing the lesser rate at which the children consume cycles. This option is *not* shared between the active and passive instances. It must be specified on each rds-stress command line. .It Fl R This tells the rds-stress parent process to run with SCHED_RR priority, giving it precedence over the child processes. This is useful when running with lots of tasks, where there is a risk of the child processes starving the parent, and skewing the results. .It Fl v With this option enabled, packets are filled with a pattern that is verified by the receiver. This check can help detect data corruption occurring under high load. .El .Pp .Ss Message Sizes Options which set a message size (such as -a) specify a number of bytes by default.
By appending \fBK\fP, \fBM\fP, or \fBG\fP, you can specify the size in kilobytes, megabytes or gigabytes, respectively. For instance, the following will run rds-stress with a message and ACK size of 1024 bytes, and an RDMA message size of 1048576 bytes: .Pp .Dl rds-stress ... -q 1K -a 1K -D 1M .Pp .Pp .Sh OUTPUT Each parent outputs columns of statistics at a regular interval: .Bl -tag -width Ds .It tsks The number of child tasks which are running. .It tx/s The number of sendmsg() calls that all children are executing, per second. .It tx+rx K/s The total number of bytes that are flowing through sendmsg() and recvmsg() for all children. This includes both request and ack messages. .It rw+rr K/s The total number of bytes that are being transferred via RDMA READs and WRITEs for all children. .It tx us/c The average number of microseconds spent in sendmsg() calls. .It rtt us The average round trip time for a request and ack message pair. This measures the total time between when a task sends a request and when it finally receives the ack for that message. Because it includes the time it takes for the receiver to wake up, receive the message, and send an ack, it can grow to be quite large under load. .It cpu % This is the percentage of available CPU resources on this machine that are being consumed since rds-stress started running. It will show -1.00 if -c is not given. It is calculated based on the amount of CPU resources that CPU soaking tasks are able to consume. This lets it measure CPU use by the system, say in interrupt handlers, that task-based CPU accounting does not include. For this to work rds-stress must be started with -c on an idle system. 
.El trunk/rds-stress.c0000644000175000017500000020566411313644724014042 0ustar benoitbenoit#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "net/rds.h" #ifdef DYNAMIC_PF_RDS #include "pfhack.h" #endif /* * * TODO * - checksum the data some day. * - use poll to wait instead of blocking recvmsg? doesn't seem great. * - measure us/call of nonblocking recvmsg * - do something about receiver congestion * - notice when parent tcp socket dies * - should the parent be at a higher priority? * - catch ctl-c * - final stats summary page */ struct options { uint32_t req_depth; uint32_t req_size; uint32_t ack_size; uint32_t rdma_size; uint32_t send_addr; uint32_t receive_addr; uint16_t starting_port; uint16_t nr_tasks; uint32_t run_time; uint8_t summary_only; uint8_t rtprio; uint8_t tracing; uint8_t verify; uint8_t show_params; uint8_t show_perfdata; uint8_t use_cong_monitor; uint8_t rdma_use_once; uint8_t rdma_use_get_mr; uint8_t rdma_use_fence; uint8_t rdma_cache_mrs; uint8_t rdma_key_o_meter; uint8_t suppress_warnings; uint32_t rdma_alignment; uint32_t connect_retries; } __attribute__((packed)); static struct options opt; static int control_fd; struct counter { uint64_t nr; uint64_t sum; uint64_t min; uint64_t max; }; enum { S_REQ_TX_BYTES = 0, S_REQ_RX_BYTES, S_ACK_TX_BYTES, S_ACK_RX_BYTES, S_RDMA_WRITE_BYTES, S_RDMA_READ_BYTES, S_SENDMSG_USECS, S_RTT_USECS, S__LAST }; #define NR_STATS S__LAST /* * Parents share a mapped array of these with their children. Each child * gets one. It's used to communicate between the child and the parent * simply. 
*/ struct child_control { pid_t pid; int ready; struct timeval start; struct counter cur[NR_STATS]; struct counter last[NR_STATS]; } __attribute__((aligned (256))); /* arbitrary */ struct soak_control { pid_t pid; uint64_t per_sec; uint64_t counter; uint64_t last; struct timeval start; } __attribute__((aligned (256))); /* arbitrary */ void stop_soakers(struct soak_control *soak_arr); /* * Requests tend to be larger and we try to keep a certain number of them * in flight at a time. Acks are sent in response to requests and tend * to be smaller. */ #define OP_REQ 1 #define OP_ACK 2 #define RDMA_OP_READ 1 #define RDMA_OP_WRITE 2 #define RDMA_OP_TOGGLE(x) (3 - (x)) /* read becomes write and vice versa */ /* * Every message sent with sendmsg gets a header. This lets the receiver * verify that it got what was sent. */ struct header { uint32_t seq; uint32_t from_addr; uint32_t to_addr; uint16_t from_port; uint16_t to_port; uint16_t index; uint8_t op; /* RDMA related. * rdma_op must be the first field, because we * use offsetof(rdma_op) in fill_hdr and check_hdr */ uint8_t rdma_op; uint64_t rdma_addr; uint64_t rdma_phyaddr; uint64_t rdma_pattern; uint64_t rdma_key; uint32_t rdma_size; uint8_t data[0]; } __attribute__((packed)); #define MIN_MSG_BYTES (sizeof(struct header)) #define BASIC_HEADER_SIZE (size_t)(&((struct header *) 0)->rdma_op) #define die(fmt...) do { \ fprintf(stderr, fmt); \ exit(1); \ } while (0) #define die_errno(fmt, args...) do { \ fprintf(stderr, fmt ", errno: %d (%s)\n", ##args , errno,\ strerror(errno)); \ exit(1); \ } while (0) static int mrs_allocated = 0; #define trace(fmt...) do { \ if (opt.tracing) \ fprintf(stderr, fmt); \ } while (0) #define min(a,b) (a < b ? a : b) #define max(a,b) (a > b ? a : b) static unsigned long sys_page_size; /* This macro casts a pointer to uint64_t without producing warnings on either 32bit or 64bit platforms. At least with gcc, that is. 
*/ #define ptr64(p) ((unsigned long) (p)) /* zero is undefined */ static inline uint64_t minz(uint64_t a, uint64_t b) { if (a == 0) return b; if (b == 0) return a; return min(a, b); } static unsigned long long parse_ull(char *ptr, unsigned long long max) { unsigned long long val; char *endptr; val = strtoull(ptr, &endptr, 0); switch (*endptr) { case 'k': case 'K': val <<= 10; endptr++; break; case 'm': case 'M': val <<= 20; endptr++; break; case 'g': case 'G': val <<= 30; endptr++; break; } if (*ptr && !*endptr && val <= max) return val; die("invalid number '%s'\n", ptr); } static uint32_t parse_addr(char *ptr) { uint32_t addr; struct hostent *hent; hent = gethostbyname(ptr); if (hent && hent->h_addrtype == AF_INET && hent->h_length == sizeof(addr)) { memcpy(&addr, hent->h_addr, sizeof(addr)); return ntohl(addr); } die("invalid host name or dotted quad '%s'\n", ptr); } static void usage(void) { printf( "\n" "Send & Recv parameters:\n" " -r [addr] use this local address\n" " -p [port, 4000] starting port number\n" "\n" "Send parameters:\n" " -s [addr] send to this address (required)\n" " -a [bytes, %u] ack message length\n" " -q [bytes, 1024] request message length\n" " -d [depth, 1] request pipeline depth, nr outstanding\n" " -t [nr, 1] number of child tasks\n" " -T [seconds, 0] runtime of test, 0 means infinite\n" " -D [bytes] RDMA size (RDSv3 only)\n" "\n" "Optional flags:\n" " -c measure cpu use with per-cpu soak processes\n" " -V trace execution\n" " -z print a summary at end of test only\n" "\n" "Example:\n" " recv$ rds-stress\n" " send$ rds-stress -s recv -q 4096 -t 2 -d 2\n" "\n", (int) MIN_MSG_BYTES); exit(2); } static void set_rt_priority(void) { struct sched_param param; memset(¶m, 0, sizeof(param)); param.sched_priority = 1; if (sched_setscheduler(0, SCHED_RR, ¶m) < 0) die_errno("sched_setscheduler(SCHED_RR) failed"); } /* This hack lets children notice when their parents die. 
* We could also use kill(0), but that results in false * positives when the parent is a zombie (and that happens * if you have a script parsing the output of rds-stress, * and the parent dies). */ static void check_parent(pid_t pid) { if (pid != getppid()) die("parent %u exited\n", pid); } /* * put a pattern in the message so the remote side can verify that it's * what was expected. */ static unsigned char * msg_pattern; static void init_msg_pattern(struct options *opts) { unsigned int max_size = max(opts->req_size, opts->ack_size); unsigned int i, k = 11; msg_pattern = malloc(max_size); /* k = 41 * (k + 3) is a generator of Z(256). Adding * (i >> 8) makes sure the pattern is shifted by 1 in * every successive 256 byte block, so that we can detect * swapped blocks. */ for (i = 0; i < max_size; i++, k = 41 * (k + 3) + (i >> 8)) msg_pattern[i] = k; } #if __BYTE_ORDER == __LITTLE_ENDIAN #define htonll(x) bswap_64(x) #define ntohll(x) bswap_64(x) #else #define htonll(x) (x) #define ntohll(x) (x) #endif static void encode_hdr(struct header *dst, const struct header *hdr) { memset(dst, 0, sizeof(*dst)); dst->seq = htonl(hdr->seq); dst->from_addr = hdr->from_addr; /* always network byte order */ dst->from_port = hdr->from_port; /* ditto */ dst->to_addr = hdr->to_addr; /* ditto */ dst->to_port = hdr->to_port; /* ditto */ dst->index = htons(hdr->index); dst->op = hdr->op; dst->rdma_op = hdr->rdma_op; dst->rdma_addr = htonll(hdr->rdma_addr); dst->rdma_phyaddr = htonll(hdr->rdma_phyaddr); dst->rdma_pattern = htonll(hdr->rdma_pattern); dst->rdma_key = htonll(hdr->rdma_key); dst->rdma_size = htonl(hdr->rdma_size); } static void decode_hdr(struct header *dst, const struct header *hdr) { memset(dst, 0, sizeof(*dst)); dst->seq = ntohl(hdr->seq); dst->from_addr = hdr->from_addr; /* always network byte order */ dst->from_port = hdr->from_port; /* ditto */ dst->to_addr = hdr->to_addr; /* ditto */ dst->to_port = hdr->to_port; /* ditto */ dst->index = ntohs(hdr->index); dst->op = 
hdr->op; dst->rdma_op = hdr->rdma_op; dst->rdma_addr = ntohll(hdr->rdma_addr); dst->rdma_phyaddr = ntohll(hdr->rdma_phyaddr); dst->rdma_pattern = ntohll(hdr->rdma_pattern); dst->rdma_key = ntohll(hdr->rdma_key); dst->rdma_size = ntohl(hdr->rdma_size); } static void fill_hdr(void *message, uint32_t bytes, struct header *hdr) { encode_hdr(message, hdr); if (opt.verify) memcpy(message + sizeof(*hdr), msg_pattern, bytes - sizeof(*hdr)); } /* inet_ntoa uses a static buffer, so calling it twice in * a single printf as we do below will produce undefined * results. We copy the output to two static buffers, * and switch between them. */ static char *inet_ntoa_32(uint32_t val) { struct in_addr addr = { .s_addr = val }; static char buffer[2][64]; static unsigned int select = 0; select = 1 - select; strncpy(buffer[select], inet_ntoa(addr), 63); return buffer[select]; } /* * Compare incoming message header with expected header. All header fields * are in host byte order except for address and port fields. */ static int check_hdr(void *message, uint32_t bytes, const struct header *hdr) { struct header msghdr; decode_hdr(&msghdr, message); if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) { #define bleh(var, disp) \ disp(hdr->var), \ msghdr.var == hdr->var ? 
" =" : "!=", \ disp(msghdr.var) /* * This is printed as one GIANT printf() so that it serializes * with stdout() and we don't get things stomping on each * other */ printf( "An incoming message had a header which\n" "didn't contain the fields we expected:\n" " member expected eq got\n" " seq %15u %s %15u\n" " from_addr %15s %s %15s\n" " from_port %15u %s %15u\n" " to_addr %15s %s %15s\n" " to_port %15u %s %15u\n" " index %15u %s %15u\n" " op %15u %s %15u\n", bleh(seq, /**/), bleh(from_addr, inet_ntoa_32), bleh(from_port, ntohs), bleh(to_addr, inet_ntoa_32), bleh(to_port, ntohs), bleh(index, /**/), bleh(op, /**/)); #undef bleh return 1; } if (opt.verify && memcmp(message + sizeof(*hdr), msg_pattern, bytes - sizeof(*hdr))) { unsigned char *p = message + sizeof(*hdr); unsigned int i, count = 0, total = bytes - sizeof(*hdr); int offset = -1; for (i = 0; i < total; ++i) { if (p[i] != msg_pattern[i]) { if (offset < 0) offset = i; count++; } } printf("An incoming message has a corrupted payload at offset %u; " "%u out of %u bytes corrupted\n", offset, count, total); return 1; } return 0; } void stat_inc(struct counter *ctr, uint64_t val) { ctr->nr++; ctr->sum += val; ctr->min = minz(val, ctr->min); ctr->max = max(val, ctr->max); } int64_t tv_cmp(const struct timeval *a, const struct timeval *b) { int64_t a_usecs = ((uint64_t)a->tv_sec * 1000000ULL) + a->tv_usec; int64_t b_usecs = ((uint64_t)b->tv_sec * 1000000ULL) + b->tv_usec; return a_usecs - b_usecs; } /* returns a - b in usecs */ uint64_t usec_sub(struct timeval *a, struct timeval *b) { return ((uint64_t)(a->tv_sec - b->tv_sec) * 1000000ULL) + a->tv_usec - b->tv_usec; } static int bound_socket(int domain, int type, int protocol, struct sockaddr_in *sin) { int fd; int opt; fd = socket(domain, type, protocol); if (fd < 0) die_errno("socket(%d, %d, %d) failed", domain, type, protocol); opt = 1; if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) die_errno("setsockopt(SO_REUSEADDR) failed"); if (bind(fd, 
(struct sockaddr *)sin, sizeof(struct sockaddr_in)))
		die_errno("bind() failed");

	return fd;
}

/*
 * Fetch the local address the socket is bound to into *sin and return
 * the IPv4 address in host byte order.
 */
static uint32_t get_local_address(int fd, struct sockaddr_in *sin)
{
	socklen_t alen = sizeof(*sin);

	if (getsockname(fd, (struct sockaddr *) sin, &alen))
		die_errno("getsockname failed");

	return ntohl(sin->sin_addr.s_addr);
}

/*
 * Create the non-blocking RDS socket used by a child, bound to *sin.
 * The send and receive buffers are sized from the task count, pipeline
 * depth and message sizes, and congestion monitoring is enabled when
 * requested (and switched off in *opts if the kernel reports
 * ENOPROTOOPT).
 */
static int rds_socket(struct options *opts, struct sockaddr_in *sin)
{
	int bytes;
	int fd;
	int val;
	socklen_t optlen;

	fd = bound_socket(PF_RDS, SOCK_SEQPACKET, 0, sin);

	bytes = opts->nr_tasks * opts->req_depth *
		(opts->req_size + opts->ack_size) * 2;

	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bytes, sizeof(bytes)))
		die_errno("setsockopt(SNDBUF, %d) failed", bytes);
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes)))
		die_errno("setsockopt(RCVBUF, %d) failed", bytes);

	/* read the sizes back and warn if we got less than we asked for */
	optlen = sizeof(val);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &optlen))
		die_errno("getsockopt(SNDBUF) failed");
	if (val / 2 < bytes && !opts->suppress_warnings)
		fprintf(stderr,
			"getsockopt(SNDBUF) returned %d, we wanted %d * 2\n",
			val, bytes);

	optlen = sizeof(val);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &optlen))
		die_errno("getsockopt(RCVBUF) failed");
	if (val / 2 < bytes && !opts->suppress_warnings)
		fprintf(stderr,
			"getsockopt(RCVBUF) returned %d, we need %d * 2\n",
			val, bytes);

	val = 1;
	if (opts->use_cong_monitor
	 && setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &val, sizeof(val))) {
		if (errno != ENOPROTOOPT)
			die_errno("setsockopt(RDS_CONG_MONITOR) failed");
		printf("Kernel does not support congestion monitoring; disabled\n");
		opts->use_cong_monitor = 0;
	}

	/* NOTE(review): the fcntl() result is not checked */
	fcntl(fd, F_SETFL, O_NONBLOCK);

	return fd;
}

/*
 * Probe whether the kernel accepts the RDS_FREE_MR socket option.
 * Returns 1 if it does (or if no local address is known yet), 0 if the
 * kernel answers ENOPROTOOPT; any other error is fatal.
 */
static int check_rdma_support(struct options *opts)
{
	struct sockaddr_in sin;
	struct rds_free_mr_args args;
	int fd, okay = 0;

	/* We need a local address to bind to. If the user
	 * didn't specify the -r option, we tell him to go on for
	 * now - he'll call back once more later.
	 */
	if (opts->receive_addr == 0)
		return 1;

	sin.sin_family = AF_INET;
	sin.sin_port = htons(opts->starting_port);
	sin.sin_addr.s_addr = htonl(opts->receive_addr);

	fd = bound_socket(AF_RDS, SOCK_SEQPACKET, 0, &sin);

	/* an all-zero FREE_MR request is only used to see how the kernel reacts */
	memset(&args, 0, sizeof(args));
	if (setsockopt(fd, SOL_RDS, RDS_FREE_MR, &args, sizeof(args)) >= 0) {
		okay = 1;
	} else if (errno == ENOPROTOOPT) {
		okay = 0;
	} else {
		die_errno("%s: RDS_FREE_MR failed with unexpected error",
				__FUNCTION__);
	}
	close(fd);

	return okay;
}

/*
 * Register 'size' bytes at 'addr' through the RDS_GET_MR socket option
 * and return the cookie the kernel stored for it.  Bumps mrs_allocated;
 * dies if the setsockopt() fails.
 */
static uint64_t get_rdma_key(int fd, uint64_t addr, uint32_t size)
{
	uint64_t cookie = 0;
	struct rds_get_mr_args mr_args;

	mr_args.vec.addr = addr;
	mr_args.vec.bytes = size;
	mr_args.cookie_addr = ptr64(&cookie);
	mr_args.flags = RDS_RDMA_READWRITE; /* for now, always assume r/w */
	if (opt.rdma_use_once)
		mr_args.flags |= RDS_RDMA_USE_ONCE;

	if (setsockopt(fd, SOL_RDS, RDS_GET_MR, &mr_args, sizeof(mr_args)))
		die_errno("setsockopt(RDS_GET_MR) failed (%u allocated)", mrs_allocated);

	trace("RDS get_rdma_key() = %Lx\n",
				(unsigned long long) cookie);

	mrs_allocated++;
	return cookie;
}

/*
 * Release a cookie obtained from get_rdma_key() through the RDS_FREE_MR
 * socket option and decrement mrs_allocated.  Dies on failure.
 */
static void free_rdma_key(int fd, uint64_t key)
{
	struct rds_free_mr_args mr_args;

	trace("RDS free_rdma_key(%Lx)\n", (unsigned long long) key);

	mr_args.cookie = key;
	/* the invalidate variant is compiled out */
#if 1
	mr_args.flags = 0;
#else
	mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
#endif

	if (setsockopt(fd, SOL_RDS, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
		die_errno("setsockopt(RDS_FREE_MR) failed");
	mrs_allocated--;
}

/*
 * RDMA key-o-meter. We track how frequently the kernel
 * re-issues R_Keys
 *
 * The key_o_meter data structures are shared between the processes
 * without any locking. We don't care much for locking here...
*/ #define RDMA_MAX_TRACKED_KEYS (32*1024) struct rdma_key_stamp { uint32_t r_key; struct timeval issued; }; struct rdma_key_trace { uint32_t count, max; struct rdma_key_stamp *entry; }; struct rdma_key_o_meter { struct rdma_key_trace *current; struct rdma_key_trace *idle; }; static struct rdma_key_o_meter *rdma_key_o_meter; static unsigned int rdma_key_task; static void rdma_key_o_meter_init(unsigned int nr_tasks) { struct rdma_key_trace *kt; struct rdma_key_stamp *ks; uint32_t max; unsigned int i, size; void *base; size = sizeof(struct rdma_key_o_meter) + 2 * nr_tasks * sizeof(*kt) + 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks); base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0); if (base == MAP_FAILED) die_errno("alloc_rdma_buffers: mmap failed"); rdma_key_o_meter = (struct rdma_key_o_meter *) base; base = rdma_key_o_meter + 1; rdma_key_o_meter->current = (struct rdma_key_trace *) base; base = rdma_key_o_meter->current + nr_tasks; rdma_key_o_meter->idle = (struct rdma_key_trace *) base; base = rdma_key_o_meter->idle + nr_tasks; ks = (struct rdma_key_stamp *) base; max = RDMA_MAX_TRACKED_KEYS / nr_tasks; for (i = 0, kt = rdma_key_o_meter->current; i < 2 * nr_tasks; ++i, ++kt) { kt->count = 0; kt->max = max; kt->entry = ks + i * max; } } /* This is called in the child process to set the index of * the key-o-meter to use */ static void rdma_key_o_meter_set_self(unsigned int task_idx) { rdma_key_task = task_idx; } static void rdma_key_o_meter_add(uint32_t key) { struct rdma_key_trace *kt; if (!rdma_key_o_meter) return; kt = &rdma_key_o_meter->current[rdma_key_task]; if (kt->count < kt->max) { kt->entry[kt->count].r_key = key; gettimeofday(&kt->entry[kt->count].issued, NULL); kt->count++; } } static int rdma_key_stamp_compare(const void *p1, const void *p2) { const struct rdma_key_stamp *ks1 = p1, *ks2 = p2; if (ks1->r_key < ks2->r_key) return -1; if (ks1->r_key > ks2->r_key) return 1; return tv_cmp(&ks1->issued, &ks2->issued); } static void 
rdma_key_o_meter_check(unsigned int nr_tasks) { struct rdma_key_stamp *ks, sorted[RDMA_MAX_TRACKED_KEYS]; struct rdma_key_trace *kt; unsigned int i, j, count = 0; unsigned int reissued = 0; double min_elapsed = 0, avg_elapsed = 0; if (!rdma_key_o_meter) return; /* Extract keys from all tasks and sort them. */ kt = rdma_key_o_meter->idle; for (i = 0; i < nr_tasks; ++i, ++kt) { ks = kt->entry; for (j = 0; j < kt->count; ++j) sorted[count++] = *ks++; kt->count = 0; } qsort(sorted, count, sizeof(*sorted), rdma_key_stamp_compare); /* Now see how many were reissued */ ks = sorted; for (i = 0; i + 1 < count; ++i, ++ks) { double elapsed; if (ks[0].r_key != ks[1].r_key) continue; elapsed = 1e-6 * usec_sub(&ks[1].issued, &ks[0].issued); if (reissued == 0 || elapsed < min_elapsed) min_elapsed = elapsed; avg_elapsed += elapsed; } if (reissued) printf(" *** %u R_Keys were re-issued; min distance=%f sec, avg distance=%f sec\n", reissued, min_elapsed, avg_elapsed / reissued); /* Swap current and idle */ kt = rdma_key_o_meter->current; rdma_key_o_meter->current = rdma_key_o_meter->idle; rdma_key_o_meter->idle = kt; } static void rds_fill_buffer(void *buf, size_t size, uint64_t pattern) { uint64_t *pos, *end; pos = (uint64_t *) buf; end = (uint64_t *) (buf + size); while (pos < end) *pos++ = pattern; } #if 0 static void rds_dump_buffer(const void *buf, size_t size) { const uint64_t *pos; unsigned int i, count; pos = (const uint64_t *) buf; count = size / sizeof(uint64_t); pos = (const uint64_t *) buf; printf("rds_dump_buffer(%p, %u)\n", buf, (int) size); for (i = 0; i < count; ++i) { if ((i % 4) == 0) printf("\n%08x:", i); printf(" %016Lx", (unsigned long long) *pos++); } } #endif static void rds_compare_buffer(uint64_t *addr, int size, uint64_t pattern) { int d, failed = 0; for (d = 0; d < size / sizeof(uint64_t); d++) { if (addr[d] == pattern) continue; failed = 1; trace("compare fail pattern offset %u: expected %Lx got %Lx\n", 8 * d, (unsigned long long) pattern, (unsigned long 
long) addr[d]);

#if 0
		rds_dump_buffer(addr, size);
		die("compare pass\n");
#endif
	}

	if (!failed)
		trace("compare pass pattern %Lx addr %p\n",
			(unsigned long long) pattern, addr);
}

/*
 * Per-destination state kept by a child.  The pointer members are
 * arrays indexed by request queue slot.
 */
struct task {
	unsigned int		nr;		/* index of this task */
	unsigned int		pending;	/* requests sent, not yet acked */
	unsigned int		unacked;	/* requests received, ack not sent */
	struct sockaddr_in	src_addr;	/* same for all tasks */
	struct sockaddr_in	dst_addr;
	unsigned char		congested;
	unsigned char		drain_rdmas;
	uint32_t		send_seq;
	uint32_t		recv_seq;
	uint16_t		send_index;
	uint16_t		recv_index;
	struct timeval *	send_time;
	struct header *		ack_header;

	/* RDMA related stuff */
	uint64_t **		local_buf;
	uint64_t **		rdma_buf;
	uint64_t *		rdma_req_key;
	uint8_t *		rdma_inflight;
	uint32_t		buffid;
	uint8_t			rdma_next_op;
};

/*
 * Map one anonymous region and hand out an rdma_buf and a local_buf of
 * opts->rdma_size bytes for every queue slot of every task, starting
 * opts->rdma_alignment bytes into the mapping.  Also clears each slot's
 * request key and in-flight flag.
 */
static void alloc_rdma_buffers(struct task *t, struct options *opts)
{
	unsigned int i, j;
	size_t len;
	caddr_t base;

	/* We use mmap here rather than malloc, because it is always
	 * page aligned. */
	len = 2 * opts->nr_tasks * opts->req_depth * opts->rdma_size + sys_page_size;
	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
	if (base == MAP_FAILED)
		die_errno("alloc_rdma_buffers: mmap failed");
	memset(base, 0x2f, len);
	base += opts->rdma_alignment;

	for (i = 0; i < opts->nr_tasks; ++i, ++t) {
		for (j = 0; j < opts->req_depth; ++j) {
			t->rdma_buf[j] = (uint64_t *) base;
			base += opts->rdma_size;

			t->local_buf[j] = (uint64_t *) base;
			base += opts->rdma_size;

			t->rdma_req_key[j] = 0;
			t->rdma_inflight[j] = 0;
		}
	}
}

/*
 * Fill in the RDMA fields of a request header for the task's current
 * send slot.  A key is fetched with get_rdma_key() when rdma_use_get_mr
 * is set and the slot has none yet; the op alternates between read and
 * write on each call.
 */
static void rdma_build_req(int fd, struct header *hdr, struct task *t,
		unsigned int rdma_size, unsigned int req_depth)
{
	uint64_t *rdma_addr, *rdma_key_p;

	rdma_addr = t->rdma_buf[t->send_index];

	rdma_key_p = &t->rdma_req_key[t->send_index];
	if (opt.rdma_use_get_mr && *rdma_key_p == 0)
		*rdma_key_p = get_rdma_key(fd, ptr64(rdma_addr), rdma_size);

	/* We alternate between RDMA READ and WRITEs */
	hdr->rdma_op = t->rdma_next_op;
	t->rdma_next_op = RDMA_OP_TOGGLE(t->rdma_next_op);

	/* pattern: send sequence number in the upper half, pid ORed in below */
	hdr->rdma_pattern = (((uint64_t) t->send_seq) << 32) | getpid();
	hdr->rdma_addr =
ptr64(rdma_addr);
	hdr->rdma_phyaddr = 0;
	hdr->rdma_size = rdma_size;
	hdr->rdma_key = *rdma_key_p;

	if (RDMA_OP_READ == hdr->rdma_op) {
		/* for a read, the buffer is pre-filled with the pattern */
		if (opt.verify)
			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
		trace("Requesting RDMA read for pattern %Lx "
				"local addr to rdma read %p\n",
				(unsigned long long) hdr->rdma_pattern,
				rdma_addr);
	} else {
		/* for a write, the buffer is cleared first */
		if (opt.verify)
			rds_fill_buffer(rdma_addr, rdma_size, 0);
		trace("Requesting RDMA write for pattern %Lx "
				"local addr to rdma write %p\n",
				(unsigned long long) hdr->rdma_pattern,
				rdma_addr);
	}
}

/*
 * Sanity check the RDMA fields of an incoming request: the size must
 * equal our configured rdma_size and the op must be a read or a write.
 * Dies otherwise.
 */
static void rdma_validate(const struct header *in_hdr, struct options *opts)
{
	unsigned long rdma_size;

	rdma_size = in_hdr->rdma_size;
	if (rdma_size != opts->rdma_size)
		die("Unexpected RDMA size %lu in request\n", rdma_size);

	if (in_hdr->rdma_op != RDMA_OP_READ && in_hdr->rdma_op != RDMA_OP_WRITE)
		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);


	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
		in_hdr->rdma_op == RDMA_OP_WRITE?
"write to" : "read from", rdma_size, (unsigned long long) in_hdr->rdma_addr, (unsigned long long) in_hdr->rdma_key, (unsigned long long) in_hdr->rdma_pattern); } static void rdma_build_ack(struct header *hdr, const struct header *in_hdr) { hdr->rdma_op = in_hdr->rdma_op; hdr->rdma_size = in_hdr->rdma_size; hdr->rdma_key = in_hdr->rdma_key; hdr->rdma_phyaddr = in_hdr->rdma_phyaddr; /* remote's address to rdma to / from */ hdr->rdma_addr = in_hdr->rdma_addr; /* remote's address to rdma to / from */ hdr->rdma_pattern = in_hdr->rdma_pattern; } static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex) { return t->nr * opt.req_depth + qindex; } static void rdma_mark_completed(struct task *tasks, unsigned int token, int status) { struct task *t; unsigned int i; trace("RDS rdma completion for token %x\n", token); t = &tasks[token / opt.req_depth]; i = token % opt.req_depth; if (status) { const char *errmsg; switch (status) { case RDS_RDMA_REMOTE_ERROR: errmsg = "remote error"; break; case RDS_RDMA_CANCELED: errmsg = "operation was cancelled"; break; case RDS_RDMA_DROPPED: errmsg = "operation was dropped"; break; case RDS_RDMA_OTHER_ERROR: errmsg = "other error"; break; default: errmsg = "unknown error"; break; } printf("%s:%u: RDMA op %u failed: %s\n", inet_ntoa(t->dst_addr.sin_addr), ntohs(t->dst_addr.sin_port), i, errmsg); } t->rdma_inflight[i] = 0; t->drain_rdmas = 0; } #define MSG_MAXIOVLEN 2 /* * Add a control message to the outgoing message */ static void rdma_put_cmsg(struct msghdr *msg, int type, const void *ptr, size_t size) { static char ctlbuf[1024]; struct cmsghdr *cmsg; msg->msg_control = ctlbuf; msg->msg_controllen = CMSG_SPACE(size); cmsg = CMSG_FIRSTHDR(msg); cmsg->cmsg_level = SOL_RDS; cmsg->cmsg_type = type; cmsg->cmsg_len = CMSG_LEN(size); memcpy(CMSG_DATA(cmsg), ptr, size); } /* * This sets up all the fields for an RDMA transfer. * The request is passed as a control message along with * the ACK packet. 
*/ static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr, unsigned int user_token, void *local_buf) { static struct rds_iovec iov; struct rds_rdma_args args; unsigned int rdma_size; rdma_size = hdr->rdma_size; trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p\n", user_token, (unsigned long long) hdr->rdma_key, rdma_size, local_buf); /* rdma args */ memset(&args, 0, sizeof(args)); /* Set up the iovec pointing to the RDMA buffer */ args.local_vec_addr = (uint64_t) &iov; args.nr_local = 1; iov.addr = ptr64(local_buf); iov.bytes = rdma_size; /* The remote could either give us a physical address, or * an index into a zero-based FMR. Either way, we just copy it. */ args.remote_vec.addr = hdr->rdma_phyaddr; args.remote_vec.bytes = rdma_size; args.cookie = hdr->rdma_key; /* read or write */ switch (hdr->rdma_op) { case RDMA_OP_WRITE: args.flags = RDS_RDMA_READWRITE; if (opt.verify) rds_fill_buffer(local_buf, rdma_size, hdr->rdma_pattern); break; case RDMA_OP_READ: args.flags = 0; break; } /* Fence off subsequent SENDs - this is the default */ if (opt.rdma_use_fence) args.flags |= RDS_RDMA_FENCE; args.flags |= RDS_RDMA_NOTIFY_ME; args.user_token = user_token; rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args)); } static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest) { rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest)); } static void rdma_build_cmsg_map(struct msghdr *msg, uint64_t addr, uint32_t size, rds_rdma_cookie_t *cookie) { struct rds_get_mr_args args; args.vec.addr = addr; args.vec.bytes = size; args.cookie_addr = ptr64(cookie); args.flags = RDS_RDMA_READWRITE; /* for now, always assume r/w */ if (opt.rdma_use_once) args.flags |= RDS_RDMA_USE_ONCE; rdma_put_cmsg(msg, RDS_CMSG_RDMA_MAP, &args, sizeof(args)); } static void rdma_process_ack(int fd, struct header *hdr, struct child_control *ctl) { trace("RDS rcvd rdma %s ACK for request key %Lx len %u local addr %Lx\n", 
RDMA_OP_WRITE == hdr->rdma_op ? "write" : "read",
		(unsigned long long) hdr->rdma_key,
		hdr->rdma_size,
		(unsigned long long) hdr->rdma_addr);

	/* Need to free the MR unless allocated with use_once */
	if (!opt.rdma_use_once && !opt.rdma_cache_mrs)
		free_rdma_key(fd, hdr->rdma_key);

	/* if acking an rdma write request - then remote node wrote local host buffer
	 * (data in) so count this as rdma data coming in (rdma_read) - else remote node read
	 * local host buffer so count this as rdma write (data out)
	 */
	switch (hdr->rdma_op) {
	case RDMA_OP_WRITE:
		/* remote node wrote local buffer check pattern
		 * sent via immediate data in rdma buffer
		 */
		stat_inc(&ctl->cur[S_RDMA_READ_BYTES], hdr->rdma_size);

		if (opt.verify) {
			/* This funny looking cast avoids compile warnings
			 * on 32bit platforms. */
			rds_compare_buffer((void *)(unsigned long) hdr->rdma_addr,
				hdr->rdma_size,
				hdr->rdma_pattern);
		}
		break;

	case RDMA_OP_READ:
		stat_inc(&ctl->cur[S_RDMA_WRITE_BYTES], hdr->rdma_size);
		break;
	}
}

/*
 * Initialize a message header for the given op and queue slot, using
 * the task's current send sequence number and its source/destination
 * addresses.
 */
static void build_header(struct task *t, struct header *hdr,
		unsigned int op, unsigned int qindex)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->op = op;
	hdr->seq = t->send_seq;
	hdr->from_addr = t->src_addr.sin_addr.s_addr;
	hdr->from_port = t->src_addr.sin_port;
	hdr->to_addr = t->dst_addr.sin_addr.s_addr;
	hdr->to_port = t->dst_addr.sin_port;
	hdr->index = qindex;
}

/*
 * Send one message of 'size' bytes built from *hdr to the task's
 * destination, attaching the RDMA control message that goes with it.
 *
 * Returns the number of bytes sent, or -1 with errno set to EAGAIN or
 * ENOBUFS (from sendmsg) or EBADSLT (an RDMA for this slot is still in
 * flight).  Other sendmsg errors and short sends are fatal.
 */
static int send_packet(int fd, struct task *t,
		struct header *hdr, unsigned int size)
{
	unsigned char buf[size], *rdma_flight_recorder = NULL;
	rds_rdma_cookie_t cookie = 0;
	struct msghdr msg;
	struct iovec iov;
	ssize_t ret;

	/* Make sure we always have the current sequence number.
	 * When we send ACK packets, the seq that gets filled in is
	 * stale.
	 */
	hdr->seq = t->send_seq;
	fill_hdr(buf, size, hdr);

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = (struct sockaddr *) &t->dst_addr;
	msg.msg_namelen = sizeof(t->dst_addr);

	msg.msg_iovlen = 1;
	msg.msg_iov = &iov;
	iov.iov_base = buf;
	iov.iov_len = size;

	/* If this is a REQ packet in which we pass the MR to the
	 * peer, extract the RDMA cookie and pass it on in the control
	 * message for now. */
	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
		if (hdr->rdma_key != 0) {
			/* We used GET_MR to obtain a key */
			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
			cookie = hdr->rdma_key;
			/* NOTE(review): buf was already encoded by fill_hdr()
			 * above, so this only clears the caller's header */
			hdr->rdma_key = 0;
		} else {
			/* Use the RDMA_MAP cmsg to have sendmsg do the
			 * mapping on the fly. */
			rdma_build_cmsg_map(&msg,
					hdr->rdma_addr,
					hdr->rdma_size,
					&cookie);
		}
	}

	/* If this is an ACK packet with RDMA, build the cmsg
	 * header that goes with it. */
	if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
		unsigned int qindex = hdr->index;

		if (t->rdma_inflight[qindex] != 0) {
			/* It is unlikely but (provably) possible for
			 * new requests to arrive before the RDMA notification.
			 * That's because RDMA notifications are triggered
			 * by the RDS ACK processing, which happens after new
			 * messages were queued on the socket.
			 *
			 * We return one of the more obscure error messages,
			 * which we recognize and handle in the top loop.
			 */
			trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
			errno = EBADSLT;
			return -1;
		}
		rdma_build_cmsg_xfer(&msg, hdr,
				rdma_user_token(t, qindex),
				t->local_buf[qindex]);
		/* only marked in flight once the send has succeeded */
		rdma_flight_recorder = &t->rdma_inflight[qindex];
	}

	ret = sendmsg(fd, &msg, 0);
	if (ret < 0) {
		if (errno != EAGAIN && errno != ENOBUFS)
			die_errno("sendto() failed");
		return ret;
	}
	if (ret != size)
		die("sendto() truncated - %zd", ret);

	if (rdma_flight_recorder)
		*rdma_flight_recorder = 1;
	if (cookie) {
		/* We just happen to know that the r_key is in the
		 * lower 32bit of the cookie */
		rdma_key_o_meter_add(cookie);
	}

	t->send_seq++;
	return ret;
}

/*
 * Send the next request for task t, including RDMA fields once
 * rdma_size is set and more than 10 messages have been sent.  On
 * success the send time is recorded, statistics are updated and the
 * send slot advances.  Returns send_packet()'s result.
 */
static int send_one(int fd, struct task *t,
		struct options *opts,
		struct child_control *ctl)
{
	struct timeval start;
	struct timeval stop;
	struct header hdr;
	int ret;

	build_header(t, &hdr, OP_REQ, t->send_index);
	if (opts->rdma_size && t->send_seq > 10)
		rdma_build_req(fd, &hdr, t,
				opts->rdma_size,
				opts->req_depth);

	/* time the send call itself for S_SENDMSG_USECS */
	gettimeofday(&start, NULL);
	ret = send_packet(fd, t, &hdr, opts->req_size);
	gettimeofday(&stop, NULL);

	if (ret < 0)
		return ret;

	t->send_time[t->send_index] = start;
	if (!opts->rdma_cache_mrs)
		t->rdma_req_key[t->send_index] = 0; /* we consumed this key */
	stat_inc(&ctl->cur[S_REQ_TX_BYTES], ret);
	stat_inc(&ctl->cur[S_SENDMSG_USECS], usec_sub(&stop, &start));

	t->send_index = (t->send_index + 1) % opts->req_depth;
	t->pending++;
	return ret;
}

/*
 * Send the ACK whose header was prepared in t->ack_header[qindex] and
 * account the bytes, including the RDMA transfer issued with it.
 */
static int send_ack(int fd, struct task *t, unsigned int qindex,
		struct options *opts,
		struct child_control *ctl)
{
	struct header *hdr = &t->ack_header[qindex];
	ssize_t ret;

	/* send an ack in response to the req we just got */
	ret = send_packet(fd, t, hdr, opts->ack_size);
	if (ret < 0)
		return ret;
	if (ret != opts->ack_size)
		die_errno("sendto() returned %zd", ret);

	stat_inc(&ctl->cur[S_ACK_TX_BYTES], ret);

	/* need separate rdma stats cells for send/recv */
	switch (hdr->rdma_op) {
	case RDMA_OP_WRITE:
		stat_inc(&ctl->cur[S_RDMA_WRITE_BYTES], opts->rdma_size);
		break;

	case RDMA_OP_READ:
stat_inc(&ctl->cur[S_RDMA_READ_BYTES], opts->rdma_size);
		break;
	}

	return ret;
}

/*
 * Send ACKs for all received but not yet acknowledged requests of task
 * t, oldest first.  Returns 0 when none are left, -1 on a send failure
 * or, with errno set to EAGAIN, when can_send is zero.
 */
static int ack_anything(int fd, struct task *t,
			struct options *opts,
			struct child_control *ctl,
			int can_send)
{
	while (t->unacked) {
		uint16_t qindex;

		/* slot of the oldest unacked request */
		qindex = (t->recv_index - t->unacked + opts->req_depth) % opts->req_depth;
		if (!can_send)
			goto eagain;
		if (send_ack(fd, t, qindex, opts, ctl) < 0)
			return -1;
		t->unacked -= 1;
	}

	return 0;

eagain:
	errno = EAGAIN;
	return -1;
}

/*
 * Send all outstanding ACKs, then new requests until req_depth of them
 * are pending.  Returns 0 on success, -1 with errno set otherwise
 * (EAGAIN when can_send is zero and there is something to send).
 */
static int send_anything(int fd, struct task *t,
			struct options *opts,
			struct child_control *ctl,
			int can_send)
{
	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
		return -1;

	while (t->pending < opts->req_depth) {
		if (!can_send)
			goto eagain;
		if (send_one(fd, t, opts, ctl) < 0)
			return -1;
	}

	return 0;

eagain:
	errno = EAGAIN;
	return -1;
}

/*
 * Receive one message without blocking into 'buffer' and handle the RDS
 * control messages that come with it.  *tstamp is set to the time right
 * after recvmsg() returned and *sin to the sender's address.
 * Returns recvmsg()'s result.
 */
static int recv_message(int fd,
		void *buffer, size_t size,
		rds_rdma_cookie_t *cookie,
		struct sockaddr_in *sin,
		struct timeval *tstamp,
		struct task *tasks)
{
	struct cmsghdr *cmsg;
	char cmsgbuf[256];
	struct msghdr msg;
	struct iovec iov;
	ssize_t ret;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = (struct sockaddr *) sin;
	msg.msg_namelen = sizeof(struct sockaddr_in);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	iov.iov_base = buffer;
	iov.iov_len = size;

	ret = recvmsg(fd, &msg, MSG_DONTWAIT);
	gettimeofday(tstamp, NULL);

	if (ret < 0)
		return ret;
	/* a zero length message is allowed; anything else needs a full header */
	if (ret && ret < sizeof(struct header))
		die("recvmsg() returned short data: %zd", ret);
	if (msg.msg_namelen < sizeof(struct sockaddr_in))
		die("socklen = %d < sizeof(sin) (%zu)\n",
		    msg.msg_namelen, sizeof(struct sockaddr_in));

	/* See if the message comes with a RDMA destination */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct rds_rdma_notify notify;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;
		switch (cmsg->cmsg_type) {
		case RDS_CMSG_CONG_UPDATE:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(uint64_t)))
				die("RDS_CMSG_CONG_UPDATE data too small");
			else {
				unsigned int i,
port; uint64_t mask; memcpy(&mask, CMSG_DATA(cmsg), sizeof(mask)); for (i = 0; i < opt.nr_tasks; ++i) { port = ntohs(tasks[i].dst_addr.sin_port); if (mask & RDS_CONG_MONITOR_MASK(port)) tasks[i].congested = 0; } } break; case RDS_CMSG_RDMA_DEST: if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) die("RDS_CMSG_RDMA_DEST data too small"); memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie)); break; case RDS_CMSG_RDMA_STATUS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify))) die("RDS_CMSG_RDMA_DEST data too small"); memcpy(¬ify, CMSG_DATA(cmsg), sizeof(notify)); rdma_mark_completed(tasks, notify.user_token, notify.status); break; } } return ret; } static int recv_one(int fd, struct task *tasks, struct options *opts, struct child_control *ctl) { char buf[max(opts->req_size, opts->ack_size)]; rds_rdma_cookie_t rdma_dest = 0; struct sockaddr_in sin; struct header hdr, in_hdr; struct timeval tstamp; struct task *t; uint16_t expect_index; int task_index; ssize_t ret; ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks); if (ret < 0) return ret; /* If we received only RDMA completions or cong updates, * ret will be 0 */ if (ret == 0) return 0; /* check the incoming sequence number */ task_index = ntohs(sin.sin_port) - opts->starting_port - 1; if (task_index >= opts->nr_tasks) die("received bad task index %u\n", task_index); t = &tasks[task_index]; /* make sure the incoming message's size matches its op */ decode_hdr(&in_hdr, (struct header *) buf); switch(in_hdr.op) { case OP_REQ: stat_inc(&ctl->cur[S_REQ_RX_BYTES], ret); if (ret != opts->req_size) die("req size %zd, not %u\n", ret, opts->req_size); expect_index = t->recv_index; break; case OP_ACK: stat_inc(&ctl->cur[S_ACK_RX_BYTES], ret); if (ret != opts->ack_size) die("ack size %zd, not %u\n", ret, opts->ack_size); /* This ACK should be for the oldest outstanding REQ */ expect_index = (t->send_index - t->pending + opts->req_depth) % opts->req_depth; break; default: die("unknown op %u\n", in_hdr.op); } /* * 
Verify that the incoming header indicates that this
	 * is the next in-order message to us. We can't predict
	 * op.
	 */
	hdr.op = in_hdr.op;
	hdr.seq = t->recv_seq;
	hdr.from_addr = sin.sin_addr.s_addr;
	hdr.from_port = sin.sin_port;
	hdr.to_addr = t->src_addr.sin_addr.s_addr;
	hdr.to_port = t->src_addr.sin_port;
	hdr.index = expect_index;

	if (check_hdr(buf, ret, &hdr))
		die("header from %s:%u to id %u bogus\n",
		    inet_ntoa(sin.sin_addr), htons(sin.sin_port),
		    ntohs(t->src_addr.sin_port));

	if (hdr.op == OP_ACK) {
		/* round trip time is measured from the matching send */
		stat_inc(&ctl->cur[S_RTT_USECS],
			 usec_sub(&tstamp, &t->send_time[expect_index]));
		t->pending -= 1;

		if (in_hdr.rdma_key)
			rdma_process_ack(fd, &in_hdr, ctl);
	} else {
		struct header *ack_hdr;

		/* Build the ACK header right away */
		ack_hdr = &t->ack_header[t->recv_index];
		build_header(t, ack_hdr, OP_ACK, t->recv_index);

		/* The RDMA is performed at the time the ACK
		 * message is sent. We need to mirror all
		 * RDMA related header fields in our response
		 * anyway, so that's a good place for send_ack
		 * to pick them up from.
		 */
		if (rdma_dest)
			in_hdr.rdma_key = rdma_dest;
		if (in_hdr.rdma_key) {
			rdma_validate(&in_hdr, opts);
			rdma_build_ack(ack_hdr, &in_hdr);
		}

		t->unacked += 1;
		t->recv_index = (t->recv_index + 1) % opts->req_depth;
	}
	t->recv_seq++;

	return ret;
}

/*
 * Body of a child process.  Sets up one task per peer, creates the RDS
 * socket bound to starting_port + 1 + id, waits for the parent to
 * publish the start time in ctl->start, then loops forever polling the
 * socket, draining incoming messages and keeping every task's request
 * pipeline full.
 */
static void run_child(pid_t parent_pid, struct child_control *ctl,
		      struct options *opts, uint16_t id)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int fd;
	uint16_t i;
	ssize_t ret;
	struct task tasks[opts->nr_tasks];
	struct timeval start;

	sin.sin_family = AF_INET;
	sin.sin_port = htons(opts->starting_port + 1 + id);
	sin.sin_addr.s_addr = htonl(opts->receive_addr);

	/* give main display thread a little edge?
	 */
	nice(5);

	memset(tasks, 0, sizeof(tasks));
	for (i = 0; i < opts->nr_tasks; i++) {
		tasks[i].nr = i;
		tasks[i].src_addr = sin;
		tasks[i].dst_addr.sin_family = AF_INET;
		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
		/* per-slot arrays are stack allocated; this function never
		 * returns normally (it loops until the process exits) */
		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
		tasks[i].rdma_req_key = alloca(opts->req_depth * sizeof(uint64_t));
		tasks[i].rdma_inflight = alloca(opts->req_depth * sizeof(uint8_t));
		tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
		tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
		tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
		tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
	}

	if (opts->rdma_size)
		alloc_rdma_buffers(tasks, opts);

	fd = rds_socket(opts, &sin);

	/* tell the parent we are set up, then wait for the start time */
	ctl->ready = 1;

	while (ctl->start.tv_sec == 0) {
		check_parent(parent_pid);
		sleep(1);
	}

	/* sleep until we're supposed to start */
	gettimeofday(&start, NULL);
	if (tv_cmp(&start, &ctl->start) < 0)
		usleep(usec_sub(&ctl->start, &start));

	sin.sin_family = AF_INET;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	while (1) {
		struct task *t;
		int can_send;

		check_parent(parent_pid);

		ret = poll(&pfd, 1, -1);
		if (ret < 0) {
			if (errno == EINTR)
				continue;
			die_errno("poll failed");
		}

		/* POLLOUT is only requested again if a send fails below */
		pfd.events = POLLIN;

		if (pfd.revents & POLLIN) {
			/* drain everything that is queued on the socket */
			while (recv_one(fd, tasks, opts, ctl) >= 0)
				;
		}

		/* keep the pipeline full */
		can_send = !!(pfd.revents & POLLOUT);
		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
			if (opt.use_cong_monitor && t->congested)
				continue;
			if (t->drain_rdmas)
				continue;
			if (send_anything(fd, t, opts, ctl, can_send) < 0) {
				pfd.events |= POLLOUT;

				/* If the send queue is full, we will see EAGAIN.
				 * If a particular destination is congested, the
				 * kernel will return ENOBUFS. In the former case,
				 * there's no point in trying other destinations;
				 * in the latter case we certainly want to try
				 * sending to other tasks.
				 *
				 * It would be nice if we could map the congestion
				 * map into user space :-)
				 */
				if (errno == ENOBUFS)
					t->congested = 1;
				else if (errno == EBADSLT)
					t->drain_rdmas = 1;
				else
					break;
			}
		}
	}
}

/*
 * Fork one child per task.  The children's control structures live in
 * a shared anonymous mapping, which is returned once every child has
 * set its ready flag.  Dies if a child exits while we wait.
 */
static struct child_control *start_children(struct options *opts)
{
	struct child_control *ctl;
	pid_t parent = getpid();
	pid_t pid;
	size_t len;
	uint32_t i;

	len = opts->nr_tasks * sizeof(*ctl);
	ctl = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED,
		   0, 0);
	if (ctl == MAP_FAILED)
		die("mmap of %u child control structs failed", opts->nr_tasks);

	memset(ctl, 0, len);

	/* done before forking so that the children inherit the result */
	init_msg_pattern(opts);

	if (opts->rdma_key_o_meter)
		rdma_key_o_meter_init(opts->nr_tasks);

	for (i = 0; i < opts->nr_tasks; i++) {
		pid = fork();
		if (pid == -1)
			die_errno("forking child nr %u failed", i);
		if (pid == 0) {
			/* only the first child prints warnings */
			opts->suppress_warnings = (i > 0);
			if (control_fd >= 0) {
				close(control_fd);
				control_fd = -1;
			}
			rdma_key_o_meter_set_self(i);
			run_child(parent, ctl + i, opts, i);
			exit(0);
		}
		ctl[i].pid = pid;
	}

	for (i = 0; i < opts->nr_tasks; i++) {
		if (ctl[i].ready)
			continue;
		pid = waitpid(-1, NULL, WNOHANG);
		if (pid)
			die("child %u (pid %u) exited\n", i, pid);
		sleep(1);
		i--; /* try this child again */
	}

	return ctl;
}

/* Average sample value of a counter, or 0.0 if it has no samples. */
static double avg(struct counter *ctr)
{
	if (ctr->nr)
		return (double)ctr->sum / (double)ctr->nr;
	else
		return 0.0;
}

/* Total bytes of requests and acks sent and received. */
static double throughput(struct counter *disp)
{
	return disp[S_REQ_TX_BYTES].sum + disp[S_REQ_RX_BYTES].sum +
		disp[S_ACK_TX_BYTES].sum + disp[S_ACK_RX_BYTES].sum;
}

/* Total bytes moved by RDMA reads and writes. */
static double throughput_rdma(struct counter *disp)
{
	return disp[S_RDMA_WRITE_BYTES].sum + disp[S_RDMA_READ_BYTES].sum;
}

/*
 * Fill disp[] with what changed since the previous snapshot: nr and sum
 * are the differences between each child's cur and last counters, added
 * up over all children.  Each child's last counters are then updated.
 * NOTE(review): min and max are overwritten for every child rather than
 * combined with the value already in disp[] - confirm this is intended.
 */
void stat_snapshot(struct counter *disp, struct child_control *ctl,
		   uint16_t nr_tasks)
{
	struct counter tmp[NR_STATS];
	uint16_t i;
	uint16_t s;

	memset(disp, 0, sizeof(tmp));

	for (i = 0; i < nr_tasks; i++) {
		/* work on a private copy of the child's counters */
		memcpy(tmp, ctl[i].cur, sizeof(tmp));

		for (s = 0; s < NR_STATS; s++) {
			disp[s].nr += tmp[s].nr - ctl[i].last[s].nr;
			disp[s].sum += tmp[s].sum - ctl[i].last[s].sum;
			disp[s].min = minz(tmp[s].min,
ctl[i].last[s].min); disp[s].max = max(tmp[s].max, ctl[i].last[s].max); } memcpy(ctl[i].last, tmp, sizeof(tmp)); } } void stat_accumulate(struct counter *accum, const struct counter *cur) { uint16_t s; for (s = 0; s < NR_STATS; ++s, ++cur, ++accum) { accum->nr += cur->nr; accum->sum += cur->sum; accum->min = minz(accum->min, cur->min); accum->max = max(accum->max, cur->max); } } void stat_total(struct counter *disp, struct child_control *ctl, uint16_t nr_tasks) { uint16_t i; uint16_t s; memset(disp, 0, sizeof(struct counter) * NR_STATS); for (i = 0; i < nr_tasks; i++) { for (s = 0; s < NR_STATS; s++) { disp[s].nr += ctl[i].cur[s].nr; disp[s].sum += ctl[i].cur[s].sum; disp[s].min = minz(disp[s].min, ctl[i].cur[s].min); disp[s].max = max(disp[s].max, ctl[i].cur[s].max); } } } static double cpu_use(struct soak_control *soak_arr) { struct soak_control *soak; uint64_t capacity = 0; uint64_t soaked = 0; uint64_t this; if (soak_arr == NULL) return -1.0; for (soak = soak_arr; soak && soak->per_sec; soak++) { capacity += soak->per_sec; this = soak->counter; soaked += min(soak->per_sec, this - soak->last); soak->last = this; } return (double)(capacity - soaked) * 100 / (double)capacity; } static void get_stats(int initialize) { #define NTIMES 8 struct sys_stats { /* Where we spent out time */ unsigned long long times[NTIMES]; unsigned long long other; /* Interrupt count */ unsigned long long intr; }; static struct sys_stats prev, current; static int disable = 0; char buffer[2048]; FILE *fp; if (disable) return; if ((fp = fopen("/proc/stat", "r")) == NULL) { fprintf(stderr, "Cannot open /proc/stat (%s) - " "not printing cpu stats\n", strerror(errno)); disable = 1; return; } memset(¤t, 0, sizeof(current)); while (fgets(buffer, sizeof(buffer), fp)) { if (!strncmp(buffer, "cpu ", 4)) { char *s = buffer + 4; int j; for (j = 0; 1; ++j) { unsigned long long v; while (*s == ' ') ++s; if (!isdigit(*s)) break; v = strtoull(s, &s, 10); if (j < NTIMES) current.times[j] = v; else 
current.other += v; } } else if (!strncmp(buffer, "intr ", 5)) { sscanf(buffer + 5, "%Lu", ¤t.intr); } } fclose(fp); if (initialize) { printf(",user:percent,system:percent,idle:percent" ",irq:percent,intr:count"); } else { struct sys_stats sys; unsigned long sum = 0; double scale; int j; sum = sys.other = current.other - prev.other; for (j = 0; j < NTIMES; ++j) { sys.times[j] = current.times[j] - prev.times[j]; sum += current.times[j]; } sys.intr = current.intr - prev.intr; scale = sum? 100.0 / sum : 0; /* Magic procfs offsets * 0 user * 1 nice * 2 system * 3 idle * 4 iowait * 5 irq * 6 softirq */ printf(",%f,%f,%f,%f,%Lu", (sys.times[0] + sys.times[1]) * scale, sys.times[2] * scale, (sys.times[3] + sys.times[4]) * scale, (sys.times[5] + sys.times[6]) * scale, sys.intr); } prev = current; } static void get_perfdata(int initialize) { static struct timeval last_ts, now; static struct rds_info_counter *prev, *ctr; static unsigned char *curr = NULL; static socklen_t buflen = 0; static int sock_fd = -1; int i, count, item_size; if (sock_fd < 0) { sock_fd = socket(PF_RDS, SOCK_SEQPACKET, 0); if (sock_fd < 0) die_errno("Unable to create socket"); } /* We should only loop once on the first call; after that the * buffer requirements for RDS counters should not change. 
*/ while ((item_size = getsockopt(sock_fd, SOL_RDS, RDS_INFO_COUNTERS, curr, &buflen)) < 0) { if (errno != ENOSPC) die_errno("getsockopt(RDS_INFO_COUNTERS) failed"); curr = realloc(curr, buflen); if (!curr) die_errno("Cannot allocate buffer for stats counters"); } if (item_size > sizeof(*ctr)) die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n", item_size, sizeof(*ctr)); count = buflen / item_size; if (prev == NULL) { /* First call - allocate buffer */ prev = calloc(count, sizeof(*ctr)); ctr = calloc(count, sizeof(*ctr)); } for (i = 0; i < count; ++i) memcpy(ctr + i, curr + i * item_size, item_size); gettimeofday(&now, NULL); if (initialize) { for (i = 0; i < count; ++i) { printf(",%s", ctr[i].name); if (strstr((char *) ctr[i].name, "_bytes")) printf(":bytes"); else printf(":count"); } } else { double scale; scale = 1e6 / usec_sub(&now, &last_ts); for (i = 0; i < count; ++i) { printf(",%f", (ctr[i].value - prev[i].value) * scale); } } memcpy(prev, ctr, count * sizeof(*ctr)); last_ts = now; get_stats(initialize); } static int reap_one_child(int wflags) { pid_t pid; int status; pid = waitpid(-1, &status, wflags); if (pid < 0) die("waitpid returned %u", pid); if (pid == 0) return 0; if (WIFEXITED(status)) { if (WEXITSTATUS(status) == 0) return 1; die("child pid %u exited with status %d\n", pid, WEXITSTATUS(status)); } if (WIFSIGNALED(status)) { if (WTERMSIG(status) == SIGTERM) return 1; die("child pid %u exited with signal %d\n", pid, WTERMSIG(status)); } die("child pid %u wait status %d\n", pid, status); } static void release_children_and_wait(struct options *opts, struct child_control *ctl, struct soak_control *soak_arr, int active) { struct counter disp[NR_STATS]; struct counter summary[NR_STATS]; struct timeval start, end, now, first_ts, last_ts; double cpu_total = 0; uint16_t i, cpu_samples = 0; uint16_t nr_running; gettimeofday(&start, NULL); start.tv_sec += 2; for (i = 0; i < opts->nr_tasks; i++) ctl[i].start = start; /* Allow for a 4 second 
delay: 2 seconds for the children * to come up, and 2 more of burn-in time */ printf("Starting up"); fflush(stdout); for (i = 0; i < 4; ++i) { sleep(1); stat_snapshot(disp, ctl, opts->nr_tasks); cpu_use(soak_arr); printf("."); fflush(stdout); } printf("\n"); gettimeofday(&first_ts, NULL); if (opts->run_time && active) { end = first_ts; end.tv_sec += opts->run_time; } else { timerclear(&end); } nr_running = opts->nr_tasks; memset(summary, 0, sizeof(summary)); if (opts->rtprio) set_rt_priority(); /* Prime the perf data counters and display the CSV header line * You can filter the CSV data from the rds-stress output by * grepping for the "::" marker. */ if (opt.show_perfdata) { printf("::"); printf("nr_tasks:count" ",req_size:bytes" ",ack_size:bytes" ",rdma_size:bytes"); printf(",req_sent:count" ",thruput:kB/s" ",thruput_rdma:kB/s" ",tx_delay:microseconds" ",rtt:microseconds" ",cpu:percent"); get_perfdata(1); printf("\n"); } else { printf("%4s %6s %10s %10s %7s %8s %5s\n", "tsks", "tx/s", "tx+rx K/s", "rw+rr K/s", "tx us/c", "rtt us", "cpu %"); } last_ts = first_ts; while (nr_running) { double cpu; if (active) { sleep(1); } else { struct pollfd pfd; pfd.fd = control_fd; pfd.events = POLLIN|POLLHUP; if (poll(&pfd, 1, 1000) == 1) break; } /* XXX big bug, need to mark some ctl elements dead */ stat_snapshot(disp, ctl, nr_running); gettimeofday(&now, NULL); cpu = cpu_use(soak_arr); if (!opts->summary_only) { double scale; /* Every loop takes a little more than one second; * and system load can actually introduce latencies. * So try to measure the actual time elapsed as precise * as possible, and scale all values by its inverse. 
*/ scale = 1e6 / usec_sub(&now, &last_ts); if (!opt.show_perfdata) { printf("%4u %6"PRIu64" %10.2f %10.2f %7.2f %8.2f %5.2f\n", nr_running, disp[S_REQ_TX_BYTES].nr, scale * throughput(disp) / 1024.0, scale * throughput_rdma(disp) / 1024.0, scale * avg(&disp[S_SENDMSG_USECS]), scale * avg(&disp[S_RTT_USECS]), scale * cpu); } else { printf("::"); printf("%u,%u,%u,%u,", opts->nr_tasks, opts->req_size, opts->ack_size, opts->rdma_size); printf("%Lu,%f,%f,%f,%f,%f", (unsigned long long) disp[S_REQ_TX_BYTES].nr, scale * throughput(disp) / 1024.0, scale * throughput_rdma(disp) / 1024.0, scale * avg(&disp[S_SENDMSG_USECS]), scale * avg(&disp[S_RTT_USECS]), cpu >= 0? scale * cpu : 0); /* Print RDS perf counters etc */ get_perfdata(0); printf("\n"); } rdma_key_o_meter_check(opts->nr_tasks); } stat_accumulate(summary, disp); cpu_total += cpu; cpu_samples++; last_ts = now; if (timerisset(&end) && timercmp(&now, &end, >=)) break; /* see if any children have finished or died. * This is a bit touchy - we should really be * able to tell an exited soaker from an exiting * RDS child. */ if (reap_one_child(WNOHANG)) nr_running--; } close(control_fd); control_fd = -1; if (nr_running) { for (i = 0; i < opts->nr_tasks; i++) kill(ctl[i].pid, SIGTERM); stop_soakers(soak_arr); } while (nr_running && reap_one_child(0)) nr_running--; rdma_key_o_meter_check(opts->nr_tasks); stat_total(disp, ctl, opts->nr_tasks); if (!opts->summary_only) printf("---------------------------------------------\n"); { double scale; scale = 1e6 / usec_sub(&last_ts, &first_ts); printf("%4u %6lu %10.2f %10.2f %7.2f %8.2f %5.2f (average)\n", opts->nr_tasks, (long) (scale * summary[S_REQ_TX_BYTES].nr), scale * throughput(summary) / 1024.0, scale * throughput_rdma(disp) / 1024.0, avg(&summary[S_SENDMSG_USECS]), avg(&summary[S_RTT_USECS]), soak_arr? 
scale * cpu_total : -1.0); } } static void peer_connect(int fd, const struct sockaddr_in *sin) { int retries = 0; printf("connecting to %s:%u", inet_ntoa(sin->sin_addr), ntohs(sin->sin_port)); fflush(stdout); while (connect(fd, (struct sockaddr *) sin, sizeof(*sin))) { if (retries == 0) printf(": %s", strerror(errno)); switch (errno) { case ECONNREFUSED: case EHOSTUNREACH: case ENETUNREACH: if (retries >= opt.connect_retries) break; if (retries++ == 0) printf(" - retrying"); printf("."); fflush(stdout); sleep(1); continue; } printf("\n"); die("connect(%s) failed", inet_ntoa(sin->sin_addr)); } printf("\n"); } static void peer_send(int fd, const void *ptr, size_t size) { ssize_t ret; while (size) { ret = write(fd, ptr, size); if (ret < 0) die_errno("Cannot send to peer"); size -= ret; ptr += ret; } } static void peer_recv(int fd, void *ptr, size_t size) { ssize_t ret; while (size) { ret = read(fd, ptr, size); if (ret < 0) die_errno("Cannot recv from peer"); if (ret == 0) die("Peer unexpectedly closed connection\n"); size -= ret; ptr += ret; } } static void encode_options(struct options *dst, const struct options *src) { dst->req_depth = htonl(src->req_depth); dst->req_size = htonl(src->req_size); dst->ack_size = htonl(src->ack_size); dst->rdma_size = htonl(src->rdma_size); dst->send_addr = htonl(src->send_addr); /* host byte order */ dst->receive_addr = htonl(src->receive_addr); /* host byte order */ dst->starting_port = htons(src->starting_port); /* host byte order */ dst->nr_tasks = htons(src->nr_tasks); dst->run_time = htonl(src->run_time); dst->summary_only = src->summary_only; /* byte sized */ dst->rtprio = src->rtprio; /* byte sized */ dst->tracing = src->tracing; /* byte sized */ dst->verify = src->verify; /* byte sized */ dst->show_params = src->show_params; /* byte sized */ dst->show_perfdata = src->show_perfdata; /* byte sized */ dst->use_cong_monitor = src->use_cong_monitor; /* byte sized */ dst->rdma_use_once = src->rdma_use_once; /* byte sized */ 
dst->rdma_use_get_mr = src->rdma_use_get_mr; /* byte sized */ dst->rdma_use_fence = src->rdma_use_fence; /* byte sized */ dst->rdma_cache_mrs = src->rdma_cache_mrs; /* byte sized */ dst->rdma_key_o_meter = src->rdma_key_o_meter; /* byte sized */ dst->rdma_alignment = htonl(src->rdma_alignment); dst->connect_retries = htonl(src->connect_retries); dst->suppress_warnings = src->suppress_warnings;/* byte sized */ } static void decode_options(struct options *dst, const struct options *src) { dst->req_depth = ntohl(src->req_depth); dst->req_size = ntohl(src->req_size); dst->ack_size = ntohl(src->ack_size); dst->rdma_size = ntohl(src->rdma_size); dst->send_addr = ntohl(src->send_addr); /* host byte order */ dst->receive_addr = ntohl(src->receive_addr); /* host byte order */ dst->starting_port = ntohs(src->starting_port); /* host byte order */ dst->nr_tasks = ntohs(src->nr_tasks); dst->run_time = ntohl(src->run_time); dst->summary_only = src->summary_only; /* byte sized */ dst->rtprio = src->rtprio; /* byte sized */ dst->tracing = src->tracing; /* byte sized */ dst->verify = src->verify; /* byte sized */ dst->show_params = src->show_params; /* byte sized */ dst->show_perfdata = src->show_perfdata; /* byte sized */ dst->use_cong_monitor = src->use_cong_monitor; /* byte sized */ dst->rdma_use_once = src->rdma_use_once; /* byte sized */ dst->rdma_use_get_mr = src->rdma_use_get_mr; /* byte sized */ dst->rdma_use_fence = src->rdma_use_fence; /* byte sized */ dst->rdma_cache_mrs = src->rdma_cache_mrs; /* byte sized */ dst->rdma_key_o_meter = src->rdma_key_o_meter; /* byte sized */ dst->rdma_alignment = ntohl(src->rdma_alignment); dst->connect_retries = ntohl(src->connect_retries); dst->suppress_warnings = src->suppress_warnings;/* byte sized */ } static void verify_option_encdec(const struct options *opts) { struct options ebuf, dbuf; unsigned int i; memcpy(&dbuf, opts, sizeof(*opts)); for (i = 0; i < sizeof(*opts); ++i) { unsigned char *x = &((unsigned char *) &dbuf)[i]; *x = 
~*x; } encode_options(&ebuf, opts); decode_options(&dbuf, &ebuf); if (memcmp(&dbuf, opts, sizeof(*opts))) die("encode/decode check of options struct failed"); } static int active_parent(struct options *opts, struct soak_control *soak_arr) { struct options enc_options; struct child_control *ctl; struct sockaddr_in sin; int fd; uint8_t ok; if (opts->show_params) { unsigned int k; printf("Options:\n" " %-10s %-7u\n" " %-10s %-7u\n" " %-10s %-7u\n" " %-10s %-7u\n", "Tasks", opts->nr_tasks, "Req size", opts->req_size, "ACK size", opts->ack_size, "RDMA size", opts->rdma_size); k = 0; printf(" %-10s", "RDMA opts"); if (opts->rdma_use_once) { printf(" use_once"); ++k; } if (opts->rdma_use_get_mr) { printf(" use_get_mr"); ++k; } if (opts->rdma_use_fence) { printf(" use_fence"); ++k; } if (opts->rdma_cache_mrs) { printf(" cache_mrs"); ++k; } if (opts->rdma_alignment) { printf(" align=%u", opts->rdma_alignment); ++k; } if (!k) printf(" (defaults)"); printf("\n"); printf("\n"); } /* Make sure that when we add new options, we don't forget * to add them to the encode/decode routines. */ verify_option_encdec(opts); sin.sin_family = AF_INET; sin.sin_port = htons(opts->starting_port); sin.sin_addr.s_addr = htonl(opts->receive_addr); fd = bound_socket(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sin); control_fd = fd; sin.sin_family = AF_INET; sin.sin_port = htons(opts->starting_port); sin.sin_addr.s_addr = htonl(opts->send_addr); peer_connect(fd, &sin); if (opts->receive_addr == 0) { opts->receive_addr = get_local_address(fd, &sin); if (opts->rdma_size && !check_rdma_support(opts)) die("RDMA not supported by this kernel\n"); } /* "negotiation" is overstating things a bit :-) * We just tell the peer what options to use. */ encode_options(&enc_options, opts); peer_send(fd, &enc_options, sizeof(struct options)); printf("negotiated options, tasks will start in 2 seconds\n"); ctl = start_children(opts); /* Tell the peer to start up. 
This is necessary when testing * with a large number of tasks, because otherwise the peer * may start sending before we have all our tasks running. */ peer_send(fd, &ok, sizeof(ok)); peer_recv(fd, &ok, sizeof(ok)); release_children_and_wait(opts, ctl, soak_arr, 1); return 0; } static int passive_parent(uint32_t addr, uint16_t port, struct soak_control *soak_arr) { struct options remote, *opts; struct child_control *ctl; struct sockaddr_in sin; socklen_t socklen; int lfd, fd; uint8_t ok; sin.sin_family = AF_INET; sin.sin_port = htons(port); sin.sin_addr.s_addr = htonl(addr); lfd = bound_socket(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sin); if (listen(lfd, 255)) die_errno("listen() failed"); socklen = sizeof(sin); fd = accept(lfd, (struct sockaddr *)&sin, &socklen); if (fd < 0) die_errno("accept() failed"); control_fd = fd; /* Do not accept any further connections - we don't handle them * anyway. */ close(lfd); printf("accepted connection from %s:%u", inet_ntoa(sin.sin_addr), ntohs(sin.sin_port)); if (addr == 0) { /* Get our receive address - i.e. the address the peer connected to. */ addr = get_local_address(control_fd, &sin); printf(" on %s:%u", inet_ntoa(sin.sin_addr), ntohs(sin.sin_port)); } printf("\n"); peer_recv(fd, &remote, sizeof(struct options)); decode_options(&remote, &remote); opts = &remote; /* * The sender gave us their send and receive addresses, we need * to swap them. */ opts->send_addr = opts->receive_addr; opts->receive_addr = addr; opt = *opts; ctl = start_children(opts); /* Wait for "GO" from the initiating peer */ peer_recv(fd, &ok, sizeof(ok)); peer_send(fd, &ok, sizeof(ok)); printf("negotiated options, tasks will start in 2 seconds\n"); release_children_and_wait(opts, ctl, soak_arr, 0); return 0; } /* * The soaker *constantly* spins calling getpid(). It tries to execute a * second's worth of calls before checking that it's parent is still alive. It * uses gettimeofday() to figure out the per-second rate of the series it just * executed. 
It always tries to work from the highest rate it ever saw. */ static void run_soaker(pid_t parent_pid, struct soak_control *soak) { uint64_t i; uint64_t per_sec; struct timeval start; struct timeval stop; uint64_t usecs; nice(20); soak->per_sec = 1000; while (1) { gettimeofday(&start, NULL); for (i = 0; i < soak->per_sec; i++) { syscall(SYS_getpid); soak->counter++; } gettimeofday(&stop, NULL); usecs = usec_sub(&stop, &start); per_sec = (double)soak->per_sec * 1000000.0 / (double)usecs; if (per_sec > soak->per_sec) soak->per_sec = per_sec; check_parent(parent_pid); } } struct soak_control *start_soakers(void) { struct soak_control *soak_arr; pid_t parent = getpid(); pid_t pid; size_t len; long nr_soak = sysconf(_SC_NPROCESSORS_ONLN); long i; /* an extra terminating entry which will be all 0s */ len = (nr_soak + 1) * sizeof(struct soak_control); soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0); if (soak_arr == MAP_FAILED) die("mmap of %ld soak control structs failed", nr_soak); memset(soak_arr, 0, len); printf("started %ld cycle soaking processes\n", nr_soak); for (i = 0; i < nr_soak; i++) { pid = fork(); if (pid == -1) die_errno("forking soaker nr %lu failed", i); if (pid == 0) { run_soaker(parent, soak_arr + i); exit(0); } soak_arr[i].pid = pid; } return soak_arr; } void stop_soakers(struct soak_control *soak_arr) { unsigned int i, nr_soak = sysconf(_SC_NPROCESSORS_ONLN); if (!soak_arr) return; for (i = 0; i < nr_soak; ++i) { kill(soak_arr[i].pid, SIGTERM); waitpid(soak_arr[i].pid, NULL, 0); } } void check_size(uint32_t size, uint32_t unspec, uint32_t max, char *desc, char *opt) { if (size == ~0) die("specify %s with %s\n", desc, opt); if (size < max) die("%s must be at least %u bytes\n", desc, max); } enum { OPT_RDMA_USE_ONCE = 0x100, OPT_RDMA_USE_GET_MR, OPT_RDMA_USE_FENCE, OPT_RDMA_USE_NOTIFY, OPT_RDMA_CACHE_MRS, OPT_RDMA_ALIGNMENT, OPT_RDMA_KEY_O_METER, OPT_SHOW_PARAMS, OPT_CONNECT_RETRIES, OPT_USE_CONG_MONITOR, OPT_PERFDATA, }; 
/*
 * Long option table handed to getopt_long() in main().  Entries with a
 * character value share the handling of the matching short option; the
 * OPT_* values identify options that only exist in long form.
 */
static struct option long_options[] = {
{ "req-bytes",		required_argument,	NULL,	'q'	},
{ "ack-bytes",		required_argument,	NULL,	'a'	},
{ "rdma-bytes",		required_argument,	NULL,	'D'	},
{ "tasks",		required_argument,	NULL,	't'	},
{ "depth",		required_argument,	NULL,	'd'	},
{ "recv-addr",		required_argument,	NULL,	'r'	},
{ "send-addr",		required_argument,	NULL,	's'	},
{ "port",		required_argument,	NULL,	'p'	},
{ "time",		required_argument,	NULL,	'T'	},
{ "report-cpu",		no_argument,		NULL,	'c'	},
{ "report-summary",	no_argument,		NULL,	'z'	},
{ "rtprio",		no_argument,		NULL,	'R'	},
{ "verify",		no_argument,		NULL,	'v'	},
{ "trace",		no_argument,		NULL,	'V'	},

{ "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
{ "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
{ "rdma-use-fence",	required_argument,	NULL,	OPT_RDMA_USE_FENCE },
{ "rdma-use-notify",	required_argument,	NULL,	OPT_RDMA_USE_NOTIFY },
{ "rdma-cache-mrs",	required_argument,	NULL,	OPT_RDMA_CACHE_MRS },
{ "rdma-alignment",	required_argument,	NULL,	OPT_RDMA_ALIGNMENT },
{ "rdma-key-o-meter",	no_argument,		NULL,	OPT_RDMA_KEY_O_METER },
{ "show-params",	no_argument,		NULL,	OPT_SHOW_PARAMS },
{ "show-perfdata",	no_argument,		NULL,	OPT_PERFDATA },
{ "connect-retries",	required_argument,	NULL,	OPT_CONNECT_RETRIES },
{ "use-cong-monitor",	required_argument,	NULL,	OPT_USE_CONG_MONITOR },

{ NULL }
};

/*
 * Entry point: parse the command line into a struct options, then run
 * either as the passive side (no send address given; options arrive from
 * the peer) or as the active side (options are validated here and handed
 * to active_parent()).
 *
 * Returns whatever passive_parent() or active_parent() returns.
 */
int main(int argc, char **argv)
{
	struct options opts;
	struct soak_control *soak_arr = NULL;

#ifdef DYNAMIC_PF_RDS
	/* Discover PF_RDS/SOL_RDS once, and be done with it */
	(void) discover_pf_rds();
	(void) discover_sol_rds();
#endif

#ifdef _SC_PAGESIZE
	sys_page_size = sysconf(_SC_PAGESIZE);
#else
	sys_page_size = 4096;
#endif

	/* We really want to see output when we redirect
	 * stdout to a pipe.
	 */
	setlinebuf(stdout);

	/* Fill every byte with 0xff so that a field nobody assigns keeps an
	 * all-ones value; the checks after the option loop (== 0xff, == ~0)
	 * use that to tell "not specified" apart from an explicit value.
	 * The assignments below are the real defaults. */
	memset(&opts, 0xff, sizeof(opts));
	opts.receive_addr = 0;
	opts.starting_port = 4000;
	opts.ack_size = MIN_MSG_BYTES;
	opts.req_size = 1024;
	opts.run_time = 0;
	opts.summary_only = 0;
	opts.rtprio = 0;
	opts.tracing = 0;
	opts.verify = 0;
	opts.rdma_size = 0;
	opts.use_cong_monitor = 1;
	opts.rdma_use_fence = 1;
	opts.rdma_cache_mrs = 0;
	opts.rdma_alignment = 0;
	opts.rdma_key_o_meter = 0;
	opts.show_params = 0;
	opts.connect_retries = 0;
	opts.show_perfdata = 0;

	while(1) {
		int c, index;

		c = getopt_long(argc, argv, "+a:cD:d:hp:q:Rr:s:t:T:vVz",
				long_options, &index);
		if (c == -1)
			break;

		/* The second argument of parse_ull() looks like the largest
		 * value accepted - TODO confirm against its definition. */
		switch(c) {
			case 'a':
				opts.ack_size = parse_ull(optarg, (uint32_t)~0);
				break;
			case 'c':
				/* soakers are forked right here, during parsing */
				soak_arr = start_soakers();
				break;
			case 'D':
				opts.rdma_size = parse_ull(optarg, (uint32_t)~0);
				break;
			case 'd':
				opts.req_depth = parse_ull(optarg,(uint32_t)~0);
				break;
			case 'p':
				opts.starting_port = parse_ull(optarg, (uint16_t)~0);
				break;
			case 'q':
				opts.req_size = parse_ull(optarg, (uint32_t)~0);
				break;
			case 'R':
				opts.rtprio = 1;
				break;
			case 'r':
				opts.receive_addr = parse_addr(optarg);
				break;
			case 's':
				opts.send_addr = parse_addr(optarg);
				break;
			case 't':
				opts.nr_tasks = parse_ull(optarg, (uint16_t)~0);
				break;
			case 'T':
				opts.run_time = parse_ull(optarg, (uint32_t)~0);
				break;
			case 'z':
				opts.summary_only = 1;
				break;
			case 'v':
				opts.verify = 1;
				break;
			case 'V':
				opts.tracing = 1;
				break;
			case OPT_USE_CONG_MONITOR:
				opts.use_cong_monitor = parse_ull(optarg, 1);
				break;
			case OPT_RDMA_USE_ONCE:
				opts.rdma_use_once = parse_ull(optarg, 1);
				break;
			case OPT_RDMA_USE_GET_MR:
				opts.rdma_use_get_mr = parse_ull(optarg, 1);
				break;
			case OPT_RDMA_USE_FENCE:
				opts.rdma_use_fence = parse_ull(optarg, 1);
				break;
			case OPT_RDMA_CACHE_MRS:
				opts.rdma_cache_mrs = parse_ull(optarg, 1);
				break;
			case OPT_RDMA_USE_NOTIFY:
				/* the argument is parsed but its value is discarded */
				(void) parse_ull(optarg, 1);
				break;
			case OPT_RDMA_ALIGNMENT:
				opts.rdma_alignment = parse_ull(optarg, sys_page_size);
				break;
			case OPT_RDMA_KEY_O_METER:
				opts.rdma_key_o_meter = 1;
				break;
			case OPT_SHOW_PARAMS:
				opts.show_params = 1;
				break;
			case OPT_CONNECT_RETRIES:
				opts.connect_retries = parse_ull(optarg, (uint32_t)~0);
				break;
			case OPT_PERFDATA:
				opts.show_perfdata = 1;
				break;
			case 'h':
			case '?':
			default:
				usage();
				break;
		}
	}

	/* 0xff means the option was never given (see the memset above): in
	 * that case derive it from --rdma-cache-mrs, otherwise reject
	 * combinations that contradict --rdma-cache-mrs. */
	if (opts.rdma_use_once == 0xff)
		opts.rdma_use_once = !opts.rdma_cache_mrs;
	else if (opts.rdma_cache_mrs && opts.rdma_use_once)
		die("option --rdma-cache-mrs conflicts with --rdma-use-once\n");

	if (opts.rdma_use_get_mr == 0xff)
		opts.rdma_use_get_mr = opts.rdma_cache_mrs;
	else if (opts.rdma_cache_mrs && !opts.rdma_use_get_mr)
		die("option --rdma-cache-mrs conflicts with --rdma-use-get-mr=0\n");

	/* the passive parent will read options off the wire */
	if (opts.send_addr == ~0)
		return passive_parent(opts.receive_addr,
				      opts.starting_port, soak_arr);

	/* the active parent verifies and sends its options */
	check_size(opts.ack_size, ~0, MIN_MSG_BYTES, "ack size", "-a");
	check_size(opts.req_size, ~0, MIN_MSG_BYTES, "req size", "-q");

	/* defaults */
	if (opts.req_depth == ~0)
		opts.req_depth = 1;
	if (opts.nr_tasks == (uint16_t)~0)
		opts.nr_tasks = 1;

	if (opts.rdma_size && !check_rdma_support(&opts))
		die("RDMA not supported by this kernel\n");

	/* We require RDMA to be multiples of the page size for now.
	 * this is just to simplify debugging, but eventually we
	 * need to support rdma sizes from 1 to 1meg byte
	 */
	/* NOTE: the "&& 0" makes this rounding dead code; it never runs. */
	if (opts.rdma_size && 0)
		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;

	/* publish the final settings in the file-scope "opt" copy */
	opt = opts;
	return active_parent(&opts, soak_arr);
}

/*
 * These are completely stupid.  options.c should be removed.
*/ void print_usage(int durr) { } void print_version() { } trunk/examples/0000755000175000017500000000000011313644724013366 5ustar benoitbenoittrunk/examples/Makefile0000644000175000017500000000007711313644724015032 0ustar benoitbenoit all: rds-sample rds-sample: rds-sample.o CFLAGS = -I ../net trunk/examples/README0000644000175000017500000000036011313644724014245 0ustar benoitbenoitThe source in this directory is meant to serve as an aid for becoming familiar with RDS socket programming. Questions about this or other rds-tools code are welcomed on the rds-devel list: http://oss.oracle.com/mailman/listinfo/rds-devel trunk/examples/rds-sample.c0000644000175000017500000001732711313644724015613 0ustar benoitbenoit/* * Copyright (c) 2008 Chelsio, Inc. All rights reserved. * * Author: Jon Mason * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include /* FIXME - this is a hack to getaround RDS not exporting any header files. * This is a local copy. 
*/ #include "ib_rds.h" /* These are defined in rds.h....but that file is not happily included */ #define SOL_RDS 272 #define PF_RDS 28 #define TESTPORT 4000 #define BUFSIZE 94 static int do_rdma_read(int sock, struct msghdr *msg, void *buf) { struct rds_rdma_args *args; struct rds_iovec iov; struct cmsghdr *cmsg; int rc; cmsg = CMSG_FIRSTHDR(msg); args = (struct rds_rdma_args *)CMSG_DATA(cmsg); /* Do a sendmsg call to preform the RDMA */ cmsg->cmsg_level = SOL_RDS; cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS; cmsg->cmsg_len = CMSG_LEN(sizeof(struct rds_rdma_args)); iov.addr = (uint64_t) buf; iov.bytes = BUFSIZE * sizeof(char); args->remote_vec.addr = 0; args->remote_vec.bytes = BUFSIZE * sizeof(char); args->local_vec_addr = (uint64_t) &iov; args->nr_local = 1; args->flags = RDS_RDMA_NOTIFY_ME; args->user_token = 0; msg->msg_controllen = CMSG_SPACE(sizeof(struct rds_rdma_args)); rc = sendmsg(sock, msg, 0); if (rc < 0) { printf("%s: Error sending message: %d %d\n", __func__, rc, errno); return -1; } sleep(1); rc = recvmsg(sock, msg, 0); if (rc < 0) { printf("%s: Error receiving message: %d %d\n", __func__, rc, errno); return -1; } return 0; } static void server(char *address) { struct sockaddr_in sin, din; void *buf, *ctlbuf; struct msghdr msg; struct iovec *iov; int rc, sock; buf = calloc(BUFSIZE, sizeof(char)); if (!buf) { printf("%s: calloc failed\n", __func__); return; } sock = socket(PF_RDS, SOCK_SEQPACKET, 0); if (sock < 0) { printf("%s: Error creating Socket: %d\n", __func__, sock); goto out; } memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = inet_addr(address); sin.sin_port = TESTPORT; rc = bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (rc < 0) { printf("%s: Error binding to address: %d %d\n", __func__, rc, errno); goto out; } /* The recv iov could contain a regular RDS packet or an RDMA RDS * packet, so set it up for the worst case for both. 
*/ iov = calloc(1, sizeof(struct iovec)); if (!iov) { printf("%s: calloc failed\n", __func__); goto out; } ctlbuf = calloc(1, sizeof(struct rds_rdma_args)); if (!ctlbuf) { printf("%s: calloc failed\n", __func__); goto out1; } iov[0].iov_base = buf; iov[0].iov_len = BUFSIZE * sizeof(char); memset(&msg, 0, sizeof(msg)); msg.msg_name = &din; msg.msg_namelen = sizeof(din); msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_control = ctlbuf; msg.msg_controllen = CMSG_SPACE(sizeof(struct rds_rdma_args)); printf("server listening on %s\n", inet_ntoa(sin.sin_addr)); rc = recvmsg(sock, &msg, 0); if (rc < 0) { printf("%s: Error receiving message: %d %d\n", __func__, rc, errno); goto out2; } printf("Received a packet len %d, cmsg len %d, on port %d\n", (uint32_t) iov[0].iov_len, (uint32_t) msg.msg_controllen, din.sin_port); if (msg.msg_controllen) { rc = do_rdma_read(sock, &msg, buf); if (rc < 0) goto out2; } printf("payload contains: %s\n", (char *)buf); out2: free(ctlbuf); out1: free(iov); out: free(buf); } static void create_message(char *buf) { int i; for (i = 0; i < BUFSIZE; i++) buf[i] = i + 0x21; } static int build_rds_rdma_packet(int sock, struct msghdr *msg, void *buf, uint64_t *cookie) { struct rds_get_mr_args mr_args; struct cmsghdr *cmsg; void *ctlbuf; mr_args.vec.addr = (uint64_t) buf; mr_args.vec.bytes = BUFSIZE * sizeof(char); mr_args.cookie_addr = (uint64_t) cookie; mr_args.flags = RDS_RDMA_READWRITE; ctlbuf = calloc(1, CMSG_SPACE(sizeof(mr_args))); if (!ctlbuf) { printf("%s: calloc failed\n", __func__); return -1; } msg->msg_control = ctlbuf; msg->msg_controllen = CMSG_SPACE(sizeof(mr_args)); cmsg = CMSG_FIRSTHDR(msg); cmsg->cmsg_level = SOL_RDS; cmsg->cmsg_type = RDS_CMSG_RDMA_MAP; cmsg->cmsg_len = CMSG_LEN(sizeof(mr_args)); memcpy(CMSG_DATA(cmsg), &mr_args, sizeof(mr_args)); msg->msg_iov = NULL; msg->msg_iovlen = 0; return 0; } static int build_rds_packet(struct msghdr *msg, char *buf) { struct iovec *iov; iov = calloc(1, sizeof(struct iovec)); if (!iov) { 
printf("%s: calloc failed\n", __func__); return -1; } msg->msg_iov = iov; msg->msg_iovlen = 1; iov[0].iov_base = buf; iov[0].iov_len = BUFSIZE * sizeof(char); return 0; } static void client(char *localaddr, char *remoteaddr, int rdma) { struct sockaddr_in sin, din; struct msghdr msg; uint64_t cookie = 0; int rc, sock; void *buf; buf = calloc(BUFSIZE, sizeof(char)); if (!buf) { printf("%s: calloc failed\n", __func__); return; } create_message((char *)buf); sock = socket(PF_RDS, SOCK_SEQPACKET, 0); if (sock < 0) { printf("%s: Error creating Socket: %d\n", __func__, sock); goto out; } memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = inet_addr(localaddr); rc = bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (rc < 0) { printf("%s: Error binding to address: %d %d\n", __func__, rc, errno); goto out; } memset(&msg, 0, sizeof(msg)); msg.msg_name = &din; msg.msg_namelen = sizeof(din); memset(&din, 0, sizeof(din)); din.sin_family = AF_INET; din.sin_addr.s_addr = inet_addr(remoteaddr); din.sin_port = TESTPORT; if (rdma) { rc = build_rds_rdma_packet(sock, &msg, buf, &cookie); if (rc < 0) goto out; printf("Client Sending RDMA message from %s to %s\n", localaddr, remoteaddr); } else { rc = build_rds_packet(&msg, buf); if (rc < 0) goto out; printf("client sending %d byte message %s from %s to %s on port %d\n", (uint32_t) msg.msg_iov->iov_len, (char *)buf, localaddr, remoteaddr, sin.sin_port); } rc = sendmsg(sock, &msg, 0); if (rc < 0) { printf("%s: Error sending message: %d %d\n", __func__, rc, errno); goto out1; } if (rdma) { /* reuse the same msg, as it should no longer be necessary and this incoming * msg should be empty */ rc = recvmsg(sock, &msg, 0); if (rc < 0) { printf("%s: Error receiving message: %d %d\n", __func__, rc, errno); } } out1: if (msg.msg_control) free(msg.msg_control); if (msg.msg_iov) free(msg.msg_iov); out: free(buf); } int main(int argc, char **argv) { char *serveraddr = NULL, *clientaddr = NULL; int i, rdma = 0; if (argc < 
3) { printf("not enough args\n"); return -1; } for (i = 1; i < argc; i++) { if (!strcmp("-s", argv[i])) { serveraddr = argv[i+1]; i++; } else if (!strcmp("-c", argv[i])) { clientaddr = argv[i+1]; i++; } else if (!strcmp("--rdma", argv[i])) { rdma = 1; } else printf("Invalid param\n"); } if (serveraddr && !clientaddr) { server(serveraddr); } else if (serveraddr && clientaddr) { client(clientaddr, serveraddr, rdma); } return 0; } trunk/rds-ping.10000644000175000017500000000342511313644724013361 0ustar benoitbenoit.Dd Apr 22, 2008 .Dt RDS-PING 1 .Os .Sh NAME .Nm rds-ping .Nd test reachability of remote node over RDS .Pp .Sh SYNOPSIS .Nm rds-ping .Bk -words .Op Fl c Ar count .Op Fl i Ar interval .Op Fl I Ar local_addr .Ar remote_addr .Sh DESCRIPTION .Nm rds-ping is used to test whether a remote node is reachable over RDS. Its interface is designed to operate pretty much the standard .Xr ping 8 utility, even though the way it works is pretty different. .Pp .Nm rds-ping opens several RDS sockets and sends packets to port 0 on the indicated host. This is a special port number to which no socket is bound; instead, the kernel processes incoming packets and responds to them. .Sh OPTIONS The following options are available for use on the command line: .Bl -tag -width Ds .It Fl c Ar count Causes .Nm rds-ping to exit after sending (and receiving) the specified number of packets. .It Fl I Ar address By default, .Nm rds-ping will pick the local source address for the RDS socket based on routing information for the destination address (i.e. if packets to the given destination would be routed through interface .Nm ib0 , then it will use the IP address of .Nm ib0 as source address). Using the .Fl I option, you can override this choice. .It Fl i Ar timeout By default, .Nm rds-ping will wait for one second between sending packets. Use this option to specified a different interval. The timeout value is given in seconds, and can be a floating point number. 
Optionally, append .Nm msec or .Nm usec to specify a timeout in milliseconds or microseconds, respectively. .It Specifying a timeout considerably smaller than the packet round-trip time will produce unexpected results. .El .Sh AUTHORS .Nm rds-ping was written by Olaf Kirch . .Sh SEE ALSO .Xr rds 7 , .Xr rds-info 1 , .Xr rds-stress 1 . trunk/rds-ping.c0000644000175000017500000002132711313644724013444 0ustar benoitbenoit/* * rds-ping utility * * Test reachability of a remote RDS node by sending a packet to port 0. * * Copyright (C) 2008 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "net/rds.h" #ifdef DYNAMIC_PF_RDS #include "pfhack.h" #endif #define die(fmt...) do { \ fprintf(stderr, fmt); \ exit(1); \ } while (0) #define die_errno(fmt, args...) do { \ fprintf(stderr, fmt ", errno: %d (%s)\n", ##args , errno,\ strerror(errno)); \ exit(1); \ } while (0) static struct timeval opt_wait = { 1, 1 }; /* 1s */ static unsigned long opt_count; static struct in_addr opt_srcaddr; static struct in_addr opt_dstaddr; /* For reasons of simplicity, RDS ping does not use a packet * payload that is being echoed, the way ICMP does. * Instead, we open a number of sockets on different ports, and * match packet sequence numbers with ports. */ #define NSOCKETS 8 struct socket { int fd; unsigned int sent_id; struct timeval sent_ts; unsigned int nreplies; }; static int do_ping(void); static void report_packet(struct socket *sp, const struct timeval *now, const struct in_addr *from, int err); static void usage(const char *complaint); static int rds_socket(struct in_addr *src, struct in_addr *dst); static int parse_timeval(const char *, struct timeval *); static int parse_long(const char *ptr, unsigned long *); static int parse_addr(const char *ptr, struct in_addr *); int main(int argc, char **argv) { int c; while ((c = getopt(argc, argv, "c:i:I:")) != -1) { switch (c) { case 'c': if (!parse_long(optarg, &opt_count)) die("Bad packet count <%s>\n", optarg); break; case 'I': if (!parse_addr(optarg, &opt_srcaddr)) die("Unknown source address <%s>\n", optarg); break; case 'i': if (!parse_timeval(optarg, &opt_wait)) die("Bad wait time <%s>\n", optarg); break; default: usage("Unknown option"); } } if (optind + 1 != argc) usage("Missing destination address"); if (!parse_addr(argv[optind], &opt_dstaddr)) die("Cannot parse destination address <%s>\n", argv[optind]); return do_ping(); } /* returns a - b in usecs */ static inline long 
usec_sub(const struct timeval *a, const struct timeval *b) { return ((long)(a->tv_sec - b->tv_sec) * 1000000UL) + a->tv_usec - b->tv_usec; } static int do_ping(void) { struct sockaddr_in sin; unsigned int sent = 0, recv = 0; struct timeval next_ts; struct socket socket[NSOCKETS]; struct pollfd pfd[NSOCKETS]; int i, next = 0; for (i = 0; i < NSOCKETS; ++i) { int fd; fd = rds_socket(&opt_srcaddr, &opt_dstaddr); socket[i].fd = fd; pfd[i].fd = fd; pfd[i].events = POLLIN; } memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr = opt_dstaddr; gettimeofday(&next_ts, NULL); while (1) { struct timeval now; struct sockaddr_in from; socklen_t alen = sizeof(from); long deadline; int ret; /* Fast way out - if we have received all packets, bail now. * If we're still waiting for some to come back, we need * to do the poll() below */ if (opt_count && recv >= opt_count) break; gettimeofday(&now, NULL); if (timercmp(&now, &next_ts, >=)) { struct socket *sp = &socket[next]; int err = 0; if (opt_count && sent >= opt_count) break; timeradd(&next_ts, &opt_wait, &next_ts); if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin))) err = errno; sp->sent_id = ++sent; sp->sent_ts = now; sp->nreplies = 0; next = (next + 1) % NSOCKETS; if (err) { static unsigned int nerrs = 0; report_packet(sp, NULL, NULL, err); if (err == EINVAL && nerrs++ == 0) printf(" Maybe your kernel does not support rds ping yet\n"); } } deadline = usec_sub(&next_ts, &now); ret = poll(pfd, NSOCKETS, deadline / 1000); if (ret < 0) { if (errno == EINTR) continue; die_errno("poll"); } if (ret == 0) continue; for (i = 0; i < NSOCKETS; ++i) { struct socket *sp = &socket[i]; if (!(pfd[i].revents & POLLIN)) continue; ret = recvfrom(sp->fd, NULL, 0, MSG_DONTWAIT, (struct sockaddr *) &from, &alen); gettimeofday(&now, NULL); if (ret < 0) { if (errno != EAGAIN && errno != EINTR) report_packet(sp, &now, NULL, errno); } else { report_packet(sp, &now, &from.sin_addr, 0); recv++; } } } /* Program exit code: 
signal success if we received any response. */ return recv == 0; } static void report_packet(struct socket *sp, const struct timeval *now, const struct in_addr *from_addr, int err) { printf(" %3u:", sp->sent_id); if (now) printf(" %ld usec", usec_sub(now, &sp->sent_ts)); if (from_addr && from_addr->s_addr != opt_dstaddr.s_addr) printf(" (%s)", inet_ntoa(*from_addr)); if (sp->nreplies) printf(" DUP!"); if (err) printf(" ERROR: %s", strerror(err)); printf("\n"); sp->nreplies++; } static int rds_socket(struct in_addr *src, struct in_addr *dst) { struct sockaddr_in sin; int fd; memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; fd = socket(PF_RDS, SOCK_SEQPACKET, 0); if (fd < 0) die_errno("unable to create RDS socket"); /* Guess the local source addr if not given. */ if (src->s_addr == 0) { socklen_t alen; int ufd; ufd = socket(PF_INET, SOCK_DGRAM, 0); if (ufd < 0) die_errno("unable to create UDP socket"); sin.sin_addr = *dst; sin.sin_port = htons(1); if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0) die_errno("unable to connect to %s", inet_ntoa(*dst)); alen = sizeof(sin); if (getsockname(ufd, (struct sockaddr *) &sin, &alen) < 0) die_errno("getsockname failed"); *src = sin.sin_addr; close(ufd); } sin.sin_addr = *src; sin.sin_port = 0; if (bind(fd, (struct sockaddr *) &sin, sizeof(sin))) die_errno("bind() failed"); return fd; } static void usage(const char *complaint) { fprintf(stderr, "%s\nUsage: rds-ping [options] dst_addr\n" "Options:\n" " -c count limit packet count\n" " -I interface source IP address\n", complaint); exit(1); } static int parse_timeval(const char *ptr, struct timeval *ret) { double seconds; char *endptr; seconds = strtod(ptr, &endptr); if (!strcmp(endptr, "ms") || !strcmp(endptr, "msec")) { seconds *= 1e-3; } else if (!strcmp(endptr, "us") || !strcmp(endptr, "usec")) { seconds *= 1e-6; } else if (*endptr) return 0; ret->tv_sec = (long) seconds; seconds -= ret->tv_sec; ret->tv_usec = (long) (seconds * 1e6); return 1; } static int 
parse_long(const char *ptr, unsigned long *ret) { unsigned long long val; char *endptr; val = strtoull(ptr, &endptr, 0); switch (*endptr) { case 'k': case 'K': val <<= 10; endptr++; break; case 'm': case 'M': val <<= 20; endptr++; break; case 'g': case 'G': val <<= 30; endptr++; break; } if (*endptr) return 0; *ret = val; return 1; } static int parse_addr(const char *ptr, struct in_addr *ret) { struct hostent *hent; hent = gethostbyname(ptr); if (hent && hent->h_addrtype == AF_INET && hent->h_length == sizeof(*ret)) { memcpy(ret, hent->h_addr, sizeof(*ret)); return 1; } return 0; } /* * This are completely stupid. options.c should be removed. */ void print_usage(int durr) { } void print_version() { } trunk/rds-rdma.70000644000175000017500000003175111313644724013360 0ustar benoitbenoit.TH "RDS zerocopy" 7 .SH NAME RDS-rdma \- Zerocopy Interface for RDMA over RDS .SH DESCRIPTION This manual page describes the zerocopy interface of RDS, which was added in RDSv3. For a description of the basic RDS interface, please refer to .BR rds (7). .PP The principal mode of operation for RDS zerocopy is like this: one participant (the client) wishes to initiate a direct transfer to or from some area of memory in its process address space. This memory does not have to be aligned. .PP The client obtains a handle for this region of memory, and passes it to the other participant (the server). This is called the RDMA cookie. To the application, the cookie is an opaque 64bit data type. .PP The client sends this handle to the server application, along with other details of the RDMA request (such as which data to transfer to that memory area). Throughout the following discussion, we will refer to this message as the RDMA request. .PP The server uses this RDMA cookie to initiate the requested RDMA transfer. The RDMA transfer is combined atomically with a normal RDS message, which is delivered to the client. This message is called the RDMA ACK throughout the following. 
Atomic in this context means that either both the RDMA succeeds and the RDMA ACK is delivered, or neither succeeds. .PP Thus, when the client receives the RDMA ACK, it knows that the RDMA has completed successfully. It can then release the RDMA cookie for this memory region, if it wishes to. .PP RDMA operations are not reliable, in the sense that unlike normal RDS messages, RDS RDMA operations may fail, and get dropped. .\"------------------------------- .SH INTERFACE The interface is currently based on control messages (ancillary data) sent or received via the .BR sendmsg (2) and .BR recvmsg (2) system calls. Optionally, an older interface can be used that is based on the .BR setsockopt (2) system call. However, we recommend using control messages, as this reduces the number of system calls required. .\"------------------------------- .SS Control message interface With the control message interface, the RDMA cookie is passed to the server out-of-band, included in an extension header attached to the RDS message. .PP The following outlines the mode of operation; the data types used will be specified in details in a subsequent section. .PP Initially, the client will send RDMA requests along with a .B RDS_CMSG_RDMA_MAP control message. The control message contains the address and length of the memory region for which to obtain a handle, some flags, and a pointer to a memory location (in the caller's address space) where the kernel will store the RDMA cookie. .PP Alternatively, if the application has already obtained a RDMA cookie for the memory range it wants to RDMA to/from, it can hand this cookie to the kernel using the .B RDS_CMSG_RDMA_DEST control message. .PP Either way, the kernel will include the resulting RDMA cookie in an extension header that is transmitted as part of the RDMA request to the server. .PP When the server receives the RDMA request, the kernel will deliver the cookie wrapped inside a .B RDS_CMSG_RDMA_DEST control message. 
.PP The server then initiates the data transfer by sending the RDMA ACK message along with a .B RDS_CMSG_RDMA_ARGS control message. This message contains the RDMA cookie, and the local memory to copy to or from. .PP The server process may request a notification when an RDMA operation completes. Notifications are delivered as .B RDS_CMSG_RDMA_STATUS control messages. When an application calls .BR recvmsg (2), it will either receive a regular RDS message (possibly with other RDMA related control messages), or an empty message with one or more status control messages. .PP In addition, when an RDMA operation fails for some reason and is discarded, the application can ask to receive notifications for failed messages as well, regardless of whether it asked for success notification of an individual message or not. This behavior is turned on by setting the .B RDS_RECVERR socket option. .\"------------------------------- .SS Setsockopt interface In addition to the control message interface, RDS allows a process to register and release memory ranges for RDMA through calls to .BR setsockopt (2). .TP .B RDS_GET_MR To obtain a RDMA cookie for a given memory range, the application can use .BR setsockopt " with " RDS_GET_MR . This operates essentially the same way as the .B RDS_CMSG_RDMA_MAP control message: the argument contains the address and length of the memory range to be registered, and a pointer to a RDMA cookie variable, in which the system call will store the cookie for the registered range. .TP .B RDS_FREE_MR Memory ranges can be released by calling .BR setsockopt " with " RDS_FREE_MR , giving the RDMA cookie and additional flags as arguments. .TP .B RDS_RECVERR This is a boolean option which can be set as well as queried (using .BR getsockopt ). When enabled, RDS will send RDMA notification messages to the application for any RDMA operation that fails. This option defaults to off. 
.PP For all of these calls, the .B level argument to .B setsockopt is .BR SOL_RDS . .PP .\"------------------------------- .SH RDMA MACROS AND TYPES .fi .TP .B RDMA cookie .nf typedef u_int64_t rds_rdma_cookie_t .fi .IP This encapsulates a memory location in the client process. In the current implementation, it contains the R_Key of the remote memory region, and the offset into it (so that the application does not have to worry about alignment). .IP The RDMA cookie is used in several struct types described below. The .BR RDS_CMSG_RDMA_DEST control message contains a rds_rdma_cookie_t all by itself as payload. .TP .B Mapping arguments The following data type is used with .B RDS_CMSG_RDMA_MAP control messages and with the .B RDS_GET_MR socket option: .IP .nf struct rds_iovec { u_int64_t addr; u_int64_t bytes; }; struct rds_get_mr_args { struct rds_iovec vec; u_int64_t cookie_addr; uint64_t flags; }; .fi .IP The .B cookie_addr specifies a memory location where to store the RDMA cookie. .IP The .B flags value is a bitwise OR of any of the following flags: .RS .TP .B RDS_RDMA_USE_ONCE This tells the kernel that the allocated RDMA cookie is to be used exactly once. When the RDMA ACK message arrives, the kernel will automatically unbind the memory area and release any resources associated with the cookie. .IP If this flag is not set, it is the application's responsibility to release the memory region at a later time using the .BR RDS_FREE_MR socket option. .TP .B RDS_RDMA_INVALIDATE Normally, RDMA memory mappings are invalidated lazily, as this requires some relatively costly synchronization with the HCA. However, this means that the server application can continue to access the registered memory for some indeterminate amount of time. If this flag is set, the RDS code will invalidate the mapping at the time it is released (either upon arrival of the RDMA ACK, if .B USE_ONCE was specified; or when the application destroys it using .BR FREE_MR ). 
.RE .TP .B RDMA Operation RDMA operations are initiated by the server using the .BR RDS_CMSG_RDMA_ARGS control message, which takes the following data as payload: .IP .nf struct rds_rdma_args { rds_rdma_cookie_t cookie; struct rds_iovec remote_vec; u_int64_t local_vec_addr; u_int64_t nr_local; u_int64_t flags; u_int32_t user_token; }; .fi .IP The .B cookie argument contains the RDMA cookie received from the client. The local memory is given via an array of .BR rds_iovec s. The array address is given in .BR local_vec_addr , and its number of elements is given in .BR nr_local . .IP The struct member .B remote_vec specifies a location relative to the memory area identified by the cookie: .BR remote_vec . addr is an offset into that region, and .BR remote_vec . bytes is the length of the memory window to copy to/from. This length must match the size of the local memory area, i.e. the sum of bytes in all members of the local iovec. .IP The flags field contains the bitwise OR of any of the following flags: .RS .TP .B RDS_RDMA_READWRITE If set, any RDMA WRITE is initiated from the server's memory to the client's. If not set, RDS will do a RDMA READ from the client's memory to the server's memory. .TP .B RDS_RDMA_FENCE By default, Infiniband makes no guarantee about the ordering of an RDMA READ with respect to subsequent SEND operations. Setting this flag asks that the RDMA READ should be fenced off the subsequent RDS ACK message. Setting this flag requires an additional round-trip of the IB fabric, but it is a good idea to set this flag by default, unless you are really sure you do not want it. .TP .B RDS_RDMA_NOTIFY_ME This flag requests a notification upon completion of the RDMA operation (successful or otherwise). The notification will contain the value of the .B user_token field passed in by the application. This allows the application to release resources (such as buffers) associated with the RDMA transfer. 
.RE .IP The .B user_token can be used to pass an application specific identifier to the kernel. This token is returned to the application when a status notification is generated (see the following section). .TP .B RDMA Notification The RDS kernel code is able to notify the server application when an RDMA operation completes. These notifications are delivered via .B RDS_CMSG_RDMA_STATUS control messages. .IP By default, no notifications are generated. There are two ways an application can request them. On one hand, status notifications can be enabled on a per-operation basis by setting the .B RDS_RDMA_NOTIFY_ME flag in the RDMA arguments. On the other hand, the application can request notifications for all RDMA operations that fail by setting the .B RDS_RECVERR socket option (see below). In both cases, the format of the notification is the same; and at most one notification will be sent per completed operation. .IP The message format is this: .IP .nf struct rds_rdma_notify { u_int32_t user_token; int32_t status; }; .fi .IP The .B user_token field contains the value previously given to the kernel in the .BR RDS_CMSG_RDMA_ARGS control message. The .BR status field contains a status value, with 0 indicating success, and non-zero indicating an error. .IP The following status codes are currently defined: .RS .TP .B RDS_RDMA_SUCCESS The RDMA operation succeeded. .TP .B RDS_RDMA_REMOTE_ERROR The RDMA operation failed due to a remote access error. This is usually due to an invalid R_key, offset or transfer size. .TP .B RDS_RDMA_CANCELED The RDMA operation was canceled by the application. (This error code is not yet generated). .TP .B RDS_RDMA_DROPPED RDMA operations were discarded after the connection broke and was re-established. The RDMA operation may have been processed partially. .TP .B RDS_RDMA_OTHER_ERROR Any other failure. 
.RE .TP .B RDMA setsockopt arguments When using the .B RDS_GET_MR socket option to register a memory range, the application passes a pointer to a .B struct rds_get_mr_args variable, described above. .IP The .B RDS_FREE_MR call takes an argument of type .BR "struct rds_free_mr_args" : .IP .nf struct rds_free_mr_args { rds_rdma_cookie_t cookie; u_int64_t flags; }; .fi .IP .B cookie specifies the RDMA cookie to be released. RDMA access to the memory range will usually not be revoked instantly, because the operation is rather costly. However, if the .B flags argument contains .BR RDS_RDMA_INVALIDATE , RDS will invalidate the indicated mapping immediately, as described in section .B "Mapping arguments" above. .IP If the .B cookie argument is 0, and .BR RDS_RDMA_INVALIDATE is set, RDS will invalidate old memory mappings on all devices. .\"------------------------------- .SH ERRORS In addition to the usual error codes returned by .BR sendmsg ", " recvmsg " and " setsockopt , RDS returns the following error codes: .TP .BR EAGAIN RDS was unable to map a memory range because the limit was exceeded (returned by .BR RDS_CMSG_RDMA_MAP " and " RDS_GET_MR ). .TP .BR EINVAL When sending a message, there were conflicting control messages (e.g. two .B RDMA_MAP messages, or a .B RDMA_MAP " and a " RDMA_DEST message). .IP In a .BR RDS_CMSG_RDMA_MAP " or " RDS_GET_MR operation, the application specified a memory range greater than the maximum size supported. .IP When setting up an RDMA operation with .BR RDS_CMSG_RDMA_ARGS , the size of the local memory (given in the .BR rds_iovec ) did not match the size of the remote memory range. .TP .B EBUSY RDS was unable to obtain a DMA mapping for the indicated memory. .\"------------------------------- .SH LIMITS Currently, the following limits apply .IP \(bu The maximum size of a zerocopy transfer is 1MB. This can be adjusted via the .B fmr_message_size module parameter. 
.IP \(bu The maximum number of memory ranges that can be mapped is limited to 2048 at the moment. This can be adjusted via the .B fmr_pool_size module parameter. However, the actual limit imposed by the hardware may in fact be lower. .SH AUTHORS RDS was written and is Copyright (C) 2007-2008 by Oracle, Inc. trunk/rds-sink.10000644000175000017500000000002311313644724013357 0ustar benoitbenoit.so man1/rds-gen.1 trunk/rds-sink.c0000644000175000017500000001245511313644724013455 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * rds-sink.c: Collect some RDS packets. 
*/ #define _LARGEFILE64_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include "kernel-list.h" #include "rdstool.h" void print_usage(int rc) { int namelen = strlen(progname); FILE *output = rc ? stderr : stdout; verbosef(0, output, "Usage: %s -s :\n" " %*s [-f ] [-i ]\n" " %*s [-v ...] [-q ...]\n" " %s -h\n" " %s -V\n", progname, namelen, "", namelen, "", progname, progname); exit(rc); } void print_version() { verbosef(0, stdout, "%s version VERSION\n", progname); exit(0); } static int empty_buff(struct rds_context *ctxt, char *bytes, ssize_t len) { int ret = 0; char *ptr = bytes; if (!ctxt->rc_filename) len = 0; /* Throw it away */ while (len && runningp()) { stats_print(); ret = write(STDOUT_FILENO, ptr, len); if (!ret) { verbosef(0, stderr, "%s: Unexpected end of file writing to %s\n", progname, ctxt->rc_filename); ret = -EPIPE; break; } if (ret < 0) { ret = -errno; if (ret == -EINTR) continue; verbosef(0, stderr, "%s: Error writing to %s: %s\n", progname, ctxt->rc_filename, strerror(-ret)); break; } stats_add_write(ret); ptr += ret; len -= ret; ret = 0; } return ret; } static ssize_t recv_buff(struct rds_endpoint *e, struct msghdr *msg, int flags) { ssize_t ret = 0; while (runningp()) { stats_print(); ret = recvmsg(e->re_fd, msg, flags); if (ret < 0) { ret = -errno; if (ret == -EINTR) continue; verbosef(0, stderr, "%s: Error from recvmsg: %s\n", progname, strerror(-ret)); } /* Success */ break; } return ret; } static int wli_do_recv(struct rds_context *ctxt) { struct rds_endpoint *e = ctxt->rc_saddr; ssize_t alloced = 0; ssize_t ret = 0; struct iovec iov = { .iov_base = NULL, }; struct msghdr msg = { .msg_name = &e->re_addr, .msg_namelen = sizeof(struct sockaddr_in), .msg_iov = &iov, .msg_iovlen = 1, }; verbosef(2, stderr, "Starting receive loop\n"); stats_start(); while (runningp()) { /* Calls stats_print() */ iov.iov_len = 0; ret = recv_buff(e, &msg, MSG_PEEK|MSG_TRUNC); if (ret < 0) break; if (ret > 
alloced) { verbosef(3, stderr, "Growing buffer to %zd bytes\n", ret); iov.iov_base = realloc(iov.iov_base, ret); if (iov.iov_base == NULL) { ret = -ENOMEM; break; } alloced = ret; } /* Calls stats_print() */ iov.iov_len = ret; ret = recv_buff(e, &msg, 0); if (ret < 0) break; stats_add_recv(ret); /* Calls stats_print() */ ret = empty_buff(ctxt, iov.iov_base, ret); if (ret) break; } verbosef(2, stderr, "Stopping receive loop\n"); stats_total(); return ret; } int main(int argc, char *argv[]) { int rc; char ipbuf[INET_ADDRSTRLEN]; struct rds_context ctxt = { .rc_filename = "-", }; INIT_LIST_HEAD(&ctxt.rc_daddrs); rc = parse_options(argc, argv, RDS_TOOL_BASE_OPTS RDS_SINK_OPTS, &ctxt); if (rc) print_usage(rc); inet_ntop(PF_INET, &ctxt.rc_saddr->re_addr.sin_addr, ipbuf, INET_ADDRSTRLEN); verbosef(2, stderr, "Binding endpoint %s:%d\n", ipbuf, ntohs(ctxt.rc_saddr->re_addr.sin_port)); rc = rds_bind(&ctxt); if (rc) goto out; if (ctxt.rc_filename) { rc = dup_file(&ctxt, STDOUT_FILENO, O_CREAT|O_WRONLY); if (rc) goto out; if (!strcmp(ctxt.rc_filename, "-")) ctxt.rc_filename = ""; } setup_signals(); if (rc) { verbosef(0, stderr, "%s: Unable to initialize signals\n", progname); goto out; } rc = wli_do_recv(&ctxt); out: free(ctxt.rc_saddr->re_name); free(ctxt.rc_saddr); return rc; } trunk/pfhack.c0000644000175000017500000000640411313644724013154 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * pfhack.c - discover the RDS constants * * PF_RDS and SOL_RDS should be assigned constants. However, we don't have * official values yet. There is a hack to overload an existing PF_ value * (21). This dynamic code detects what the running kernel is using. 
*/ #include #include #include #include #include #include #include #include #include #include "kernel-list.h" #include "pfhack.h" #include "rdstool.h" #define PF_RDS_PATH "/proc/sys/net/rds/pf_rds" #define SOL_RDS_PATH "/proc/sys/net/rds/sol_rds" /* We don't allow any system that can't read pf_rds */ static void explode(const char *reason) { fprintf(stderr, "%s: Unable to determine RDS constant: %s\n", progname, reason); exit(1); } static int discover_constant(const char *path, int official, int *found) { int fd; ssize_t ret, total = 0; char buf[PATH_MAX]; char *ptr; long val; if (*found >= 0) return *found; fd = open(path, O_RDONLY); if (fd < 0) { /* hmm, no more constants in /proc. we must not need it anymore * so use official values. */ *found = official; return official; } while (total < sizeof(buf)) { ret = read(fd, buf + total, sizeof(buf) - total); if (ret > 0) total += ret; else break; } close(fd); if (ret < 0) explode("Error reading address constant"); val = strtoul(buf, &ptr, 0); if ((val > INT_MAX) || !ptr || (*ptr && (*ptr != '\n'))) explode("Invalid address constant"); *found = val; return (int)val; } int discover_pf_rds() { static int pf_rds = -1; return discover_constant(PF_RDS_PATH, OFFICIAL_PF_RDS, &pf_rds); } int discover_sol_rds() { static int sol_rds = -1; return discover_constant(SOL_RDS_PATH, OFFICIAL_SOL_RDS, &sol_rds); } trunk/pfhack.h0000644000175000017500000000406711313644724013164 0ustar benoitbenoit/* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * pfhack.h - discover the RDS constants * * PF_RDS and SOL_RDS should be assigned constants. However, we don't have * official values yet. There is a hack to overload an existing PF_ value * (21). This dynamic code detects what the running kernel is using. */

#ifndef __PF_HACK_H
#define __PF_HACK_H

/*
 * Fallback values used when the running kernel does not export its own.
 * NOTE(review): the comment above still says no official values exist;
 * it looks out of date now that these are defined - confirm.
 */
#define OFFICIAL_PF_RDS		21
#define OFFICIAL_SOL_RDS	276

#ifdef DYNAMIC_PF_RDS
/* Each returns the constant to use as an int. */
extern int discover_pf_rds(void);
extern int discover_sol_rds(void);

/*
 * With DYNAMIC_PF_RDS defined, AF_RDS, PF_RDS and SOL_RDS expand to
 * function calls, so they are not compile-time constants and cannot be
 * used in case labels or static initializers.
 */
#define AF_RDS discover_pf_rds()
#define PF_RDS AF_RDS
#define SOL_RDS discover_sol_rds()
#endif  /* DYNAMIC_PF_RDS */

#endif  /* __PF_HACK_H */